Merge branch 'main' into users/chapuni/cov/single/unifyusers/chapuni/cov/single/unify

author: NAKAMURA Takumi <geek4civic@gmail.com> 2025-01-10 19:25:56 +0900
committer: NAKAMURA Takumi <geek4civic@gmail.com> 2025-01-10 19:25:56 +0900
commit: 63f5dc16d6bfca0512fb034052b41d13c3751e20 (patch)
tree: e70266be1fda941e0974e71e3d2c1cf080081311
parent: 9e5734688ed3d5f6b3fb76a26b3d90a736d60781 (diff)
parent: 397ac44f623f891d8f05d6673a95984ac0a26671 (diff)
1202 files changed, 27739 insertions, 12352 deletions
diff --git a/.github/workflows/commit-access-review.py b/.github/workflows/commit-access-review.py
index 91d3a61cdcb1..4f539fe98004 100644
--- a/.github/workflows/commit-access-review.py
+++ b/.github/workflows/commit-access-review.py
@@ -67,39 +67,47 @@ def check_manual_requests(
 ) -> list[str]:
     """
     Return a list of users who have been asked since ``start_date`` if they
-    want to keep their commit access.
+    want to keep their commit access or if they have applied for commit
+    access since ``start_date``
     """
+
     query = """
-        query ($query: String!) {
-          search(query: $query, type: ISSUE, first: 100) {
+        query ($query: String!, $after: String) {
+          search(query: $query, type: ISSUE, first: 100, after: $after) {
             nodes {
               ... on Issue {
-                body
-                comments (first: 100) {
-                  nodes {
-                    author {
-                      login
-                    }
-                  }
+                author {
+                  login
                 }
+                body
               }
             }
+            pageInfo {
+              hasNextPage
+              endCursor
+            }
           }
         }
         """
     formatted_start_date = start_date.strftime("%Y-%m-%dT%H:%M:%S")
     variables = {
-        "query": f"type:issue created:>{formatted_start_date} org:llvm repo:llvm-project label:infra:commit-access"
+        "query": f"type:issue created:>{formatted_start_date} org:llvm repo:llvm-project label:infra:commit-access,infra:commit-access-request"
     }
 
-    res_header, res_data = gh._Github__requester.graphql_query(
-        query=query, variables=variables
-    )
-    data = res_data["data"]
+    has_next_page = True
     users = []
-    for issue in data["search"]["nodes"]:
-        users.extend([user[1:] for user in re.findall("@[^ ,\n]+", issue["body"])])
-
+    while has_next_page:
+        res_header, res_data = gh._Github__requester.graphql_query(
+            query=query, variables=variables
+        )
+        data = res_data["data"]
+        for issue in data["search"]["nodes"]:
+            users.extend([user[1:] for user in re.findall("@[^ ,\n]+", issue["body"])])
+            if issue["author"]:
+                users.append(issue["author"]["login"])
+        has_next_page = data["search"]["pageInfo"]["hasNextPage"]
+        if has_next_page:
+            variables["after"] = data["search"]["pageInfo"]["endCursor"]
     return users
 
 
diff --git a/.github/workflows/containers/github-action-ci/Dockerfile b/.github/workflows/containers/github-action-ci/Dockerfile
index 58355d261c43..3757e603f8a1 100644
--- a/.github/workflows/containers/github-action-ci/Dockerfile
+++ b/.github/workflows/containers/github-action-ci/Dockerfile
@@ -57,6 +57,7 @@ RUN apt-get update && \
     nodejs \
     perl-modules \
     python3-psutil \
+    sudo \
 
     # These are needed by the premerge pipeline. Pip is used to install
     # dependent python packages and ccache is used for build caching. File and
@@ -66,6 +67,16 @@ RUN apt-get update && \
     file \
     tzdata
 
+# Install sccache as it is needed by most of the project test workflows and
+# cannot be installed by the ccache action when executing as a non-root user.
+# TODO(boomanaiden154): This should be switched to being installed with apt
+# once we bump to Ubuntu 24.04.
+RUN curl -L 'https://github.com/mozilla/sccache/releases/download/v0.7.6/sccache-v0.7.6-x86_64-unknown-linux-musl.tar.gz' > /tmp/sccache.tar.gz && \
+    echo "2902a5e44c3342132f07b62e70cca75d9b23252922faf3b924f449808cc1ae58 /tmp/sccache.tar.gz" | sha256sum -c && \
+    tar xzf /tmp/sccache.tar.gz -O --wildcards '*/sccache' > '/usr/local/bin/sccache' && \
+    rm /tmp/sccache.tar.gz && \
+    chmod +x /usr/local/bin/sccache
+
 ENV LLVM_SYSROOT=$LLVM_SYSROOT
 ENV PATH=${LLVM_SYSROOT}/bin:${PATH}
 
@@ -73,5 +84,11 @@ ENV PATH=${LLVM_SYSROOT}/bin:${PATH}
 # permissions issues in some tests. Set the user id to 1001 as that is the
 # user id that Github Actions uses to perform the checkout action.
 RUN useradd gha -u 1001 -m -s /bin/bash
+
+# Also add the user to passwordless sudoers so that we can install software
+# later on without having to rebuild the container.
+RUN adduser gha sudo
+RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
+
 USER gha
 
diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml
index f2bb37316d3a..0e6180acf4a4 100644
--- a/.github/workflows/pr-code-format.yml
+++ b/.github/workflows/pr-code-format.yml
@@ -60,7 +60,7 @@ jobs:
       - name: Install clang-format
         uses: aminya/setup-cpp@v1
         with:
-          clangformat: 18.1.7
+          clangformat: 19.1.6
 
       - name: Setup Python env
         uses: actions/setup-python@v5
diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index 7a9762812cc1..261dc8bbb97e 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -31,6 +31,12 @@ jobs:
       - name: Setup ccache
         uses: hendrikmuhs/ccache-action@v1.2.14
       - name: Build and Test
+        # Mark the job as a success even if the step fails so that people do
+        # not get notified while the new premerge pipeline is in an
+        # experimental state.
+        # TODO(boomanaiden154): Remove this once the pipeline is stable and we
+        # are ready for people to start recieving notifications.
+        continue-on-error: true
         run: |
           git config --global --add safe.directory '*'
 
diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index 1744c1e57172..1aad25242712 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -47,12 +47,16 @@ BreakFunctionNames("break-funcs",
   cl::cat(BoltCategory));
 
 static cl::list<std::string>
-FunctionPadSpec("pad-funcs",
-  cl::CommaSeparated,
-  cl::desc("list of functions to pad with amount of bytes"),
-  cl::value_desc("func1:pad1,func2:pad2,func3:pad3,..."),
-  cl::Hidden,
-  cl::cat(BoltCategory));
+    FunctionPadSpec("pad-funcs", cl::CommaSeparated,
+                    cl::desc("list of functions to pad with amount of bytes"),
+                    cl::value_desc("func1:pad1,func2:pad2,func3:pad3,..."),
+                    cl::Hidden, cl::cat(BoltCategory));
+
+static cl::list<std::string> FunctionPadBeforeSpec(
+    "pad-funcs-before", cl::CommaSeparated,
+    cl::desc("list of functions to pad with amount of bytes"),
+    cl::value_desc("func1:pad1,func2:pad2,func3:pad3,..."), cl::Hidden,
+    cl::cat(BoltCategory));
 
 static cl::opt<bool> MarkFuncs(
     "mark-funcs",
@@ -70,11 +74,11 @@ X86AlignBranchBoundaryHotOnly("x86-align-branch-boundary-hot-only",
   cl::init(true),
   cl::cat(BoltOptCategory));
 
-size_t padFunction(const BinaryFunction &Function) {
-  static std::map<std::string, size_t> FunctionPadding;
-
-  if (FunctionPadding.empty() && !FunctionPadSpec.empty()) {
-    for (std::string &Spec : FunctionPadSpec) {
+size_t padFunction(std::map<std::string, size_t> &FunctionPadding,
+                   const cl::list<std::string> &Spec,
+                   const BinaryFunction &Function) {
+  if (FunctionPadding.empty() && !Spec.empty()) {
+    for (const std::string &Spec : Spec) {
       size_t N = Spec.find(':');
       if (N == std::string::npos)
         continue;
@@ -94,6 +98,15 @@ size_t padFunction(const BinaryFunction &Function) {
   return 0;
 }
 
+size_t padFunctionBefore(const BinaryFunction &Function) {
+  static std::map<std::string, size_t> CacheFunctionPadding;
+  return padFunction(CacheFunctionPadding, FunctionPadBeforeSpec, Function);
+}
+size_t padFunctionAfter(const BinaryFunction &Function) {
+  static std::map<std::string, size_t> CacheFunctionPadding;
+  return padFunction(CacheFunctionPadding, FunctionPadSpec, Function);
+}
+
 } // namespace opts
 
 namespace {
@@ -319,6 +332,31 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function,
     Streamer.emitCodeAlignment(Function.getAlign(), &*BC.STI);
   }
 
+  if (size_t Padding = opts::padFunctionBefore(Function)) {
+    // Handle padFuncsBefore after the above alignment logic but before
+    // symbol addresses are decided.
+    if (!BC.HasRelocations) {
+      BC.errs() << "BOLT-ERROR: -pad-before-funcs is not supported in "
+                << "non-relocation mode\n";
+      exit(1);
+    }
+
+    // Preserve Function.getMinAlign().
+    if (!isAligned(Function.getMinAlign(), Padding)) {
+      BC.errs() << "BOLT-ERROR: user-requested " << Padding
+                << " padding bytes before function " << Function
+                << " is not a multiple of the minimum function alignment ("
+                << Function.getMinAlign().value() << ").\n";
+      exit(1);
+    }
+
+    LLVM_DEBUG(dbgs() << "BOLT-DEBUG: padding before function " << Function
+                      << " with " << Padding << " bytes\n");
+
+    // Since the padding is not executed, it can be null bytes.
+    Streamer.emitFill(Padding, 0);
+  }
+
   MCContext &Context = Streamer.getContext();
   const MCAsmInfo *MAI = Context.getAsmInfo();
 
@@ -373,7 +411,7 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function,
   emitFunctionBody(Function, FF, /*EmitCodeOnly=*/false);
 
   // Emit padding if requested.
-  if (size_t Padding = opts::padFunction(Function)) {
+  if (size_t Padding = opts::padFunctionAfter(Function)) {
     LLVM_DEBUG(dbgs() << "BOLT-DEBUG: padding function " << Function << " with "
                       << Padding << " bytes\n");
     Streamer.emitFill(Padding, MAI->getTextAlignFillValue());
diff --git a/bolt/lib/Core/CMakeLists.txt b/bolt/lib/Core/CMakeLists.txt
index bb58667066fd..8c1f5d0bb37b 100644
--- a/bolt/lib/Core/CMakeLists.txt
+++ b/bolt/lib/Core/CMakeLists.txt
@@ -35,6 +35,7 @@ add_llvm_library(LLVMBOLTCore
   ParallelUtilities.cpp
   Relocation.cpp
 
+  NO_EXPORT
   DISABLE_LLVM_LINK_LLVM_DYLIB
   LINK_LIBS
   ${LLVM_PTHREAD_LIB}
diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt
index 1c1273b3d242..1e3289484a5b 100644
--- a/bolt/lib/Passes/CMakeLists.txt
+++ b/bolt/lib/Passes/CMakeLists.txt
@@ -46,6 +46,7 @@ add_llvm_library(LLVMBOLTPasses
   VeneerElimination.cpp
   RetpolineInsertion.cpp
 
+  NO_EXPORT
   DISABLE_LLVM_LINK_LLVM_DYLIB
 
   LINK_LIBS
diff --git a/bolt/lib/Passes/ReorderFunctions.cpp b/bolt/lib/Passes/ReorderFunctions.cpp
index 1256d71342b1..35c5acfdecdb 100644
--- a/bolt/lib/Passes/ReorderFunctions.cpp
+++ b/bolt/lib/Passes/ReorderFunctions.cpp
@@ -28,7 +28,8 @@ extern cl::OptionCategory BoltOptCategory;
 extern cl::opt<unsigned> Verbosity;
 extern cl::opt<uint32_t> RandomSeed;
 
-extern size_t padFunction(const bolt::BinaryFunction &Function);
+extern size_t padFunctionBefore(const bolt::BinaryFunction &Function);
+extern size_t padFunctionAfter(const bolt::BinaryFunction &Function);
 
 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
 cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions(
@@ -304,8 +305,10 @@ Error ReorderFunctions::runOnFunctions(BinaryContext &BC) {
                           return false;
                         if (B->isIgnored())
                           return true;
-                        const size_t PadA = opts::padFunction(*A);
-                        const size_t PadB = opts::padFunction(*B);
+                        const size_t PadA = opts::padFunctionBefore(*A) +
+                                            opts::padFunctionAfter(*A);
+                        const size_t PadB = opts::padFunctionBefore(*B) +
+                                            opts::padFunctionAfter(*B);
                         if (!PadA || !PadB) {
                           if (PadA)
                             return true;
diff --git a/bolt/lib/Profile/CMakeLists.txt b/bolt/lib/Profile/CMakeLists.txt
index 9aa4ba0490b0..a2bb4aa074c7 100644
--- a/bolt/lib/Profile/CMakeLists.txt
+++ b/bolt/lib/Profile/CMakeLists.txt
@@ -7,6 +7,7 @@ add_llvm_library(LLVMBOLTProfile
   YAMLProfileReader.cpp
   YAMLProfileWriter.cpp
 
+  NO_EXPORT
   DISABLE_LLVM_LINK_LLVM_DYLIB
 
   LINK_COMPONENTS
diff --git a/bolt/lib/Rewrite/CMakeLists.txt b/bolt/lib/Rewrite/CMakeLists.txt
index 5d114925f59b..c83cf3698216 100644
--- a/bolt/lib/Rewrite/CMakeLists.txt
+++ b/bolt/lib/Rewrite/CMakeLists.txt
@@ -25,6 +25,7 @@ add_llvm_library(LLVMBOLTRewrite
   RewriteInstance.cpp
   SDTRewriter.cpp
 
+  NO_EXPORT
   DISABLE_LLVM_LINK_LLVM_DYLIB
 
   LINK_LIBS
diff --git a/bolt/lib/RuntimeLibs/CMakeLists.txt b/bolt/lib/RuntimeLibs/CMakeLists.txt
index d3ac71d3e797..b8db7e4a1553 100644
--- a/bolt/lib/RuntimeLibs/CMakeLists.txt
+++ b/bolt/lib/RuntimeLibs/CMakeLists.txt
@@ -11,6 +11,7 @@ add_llvm_library(LLVMBOLTRuntimeLibs
   HugifyRuntimeLibrary.cpp
   InstrumentationRuntimeLibrary.cpp
 
+  NO_EXPORT
   DISABLE_LLVM_LINK_LLVM_DYLIB
   )
 
diff --git a/bolt/lib/Target/AArch64/CMakeLists.txt b/bolt/lib/Target/AArch64/CMakeLists.txt
index 7e2d33e09b5a..8435ea7245e7 100644
--- a/bolt/lib/Target/AArch64/CMakeLists.txt
+++ b/bolt/lib/Target/AArch64/CMakeLists.txt
@@ -19,6 +19,7 @@ endif()
 add_llvm_library(LLVMBOLTTargetAArch64
   AArch64MCPlusBuilder.cpp
 
+  NO_EXPORT
   DISABLE_LLVM_LINK_LLVM_DYLIB
 
   DEPENDS
diff --git a/bolt/lib/Target/RISCV/CMakeLists.txt b/bolt/lib/Target/RISCV/CMakeLists.txt
index 5d19d38717de..6c3a196f8a1f 100644
--- a/bolt/lib/Target/RISCV/CMakeLists.txt
+++ b/bolt/lib/Target/RISCV/CMakeLists.txt
@@ -20,6 +20,7 @@ endif()
 add_llvm_library(LLVMBOLTTargetRISCV
   RISCVMCPlusBuilder.cpp
 
+  NO_EXPORT
   DISABLE_LLVM_LINK_LLVM_DYLIB
 
   DEPENDS
diff --git a/bolt/lib/Target/X86/CMakeLists.txt b/bolt/lib/Target/X86/CMakeLists.txt
index b274716e89a4..6d1accb5e815 100644
--- a/bolt/lib/Target/X86/CMakeLists.txt
+++ b/bolt/lib/Target/X86/CMakeLists.txt
@@ -21,6 +21,7 @@ add_llvm_library(LLVMBOLTTargetX86
   X86MCPlusBuilder.cpp
   X86MCSymbolizer.cpp
 
+  NO_EXPORT
   DISABLE_LLVM_LINK_LLVM_DYLIB
 
   DEPENDS
diff --git a/bolt/lib/Utils/CMakeLists.txt b/bolt/lib/Utils/CMakeLists.txt
index c452c1fac377..efba6d54449d 100644
--- a/bolt/lib/Utils/CMakeLists.txt
+++ b/bolt/lib/Utils/CMakeLists.txt
@@ -29,6 +29,8 @@ add_llvm_library(LLVMBOLTUtils
   CommandLineOpts.cpp
   Utils.cpp
   ${version_inc}
+
+  NO_EXPORT
   DISABLE_LLVM_LINK_LLVM_DYLIB
 
   LINK_LIBS
diff --git a/bolt/test/AArch64/pad-before-funcs.s b/bolt/test/AArch64/pad-before-funcs.s
new file mode 100644
index 000000000000..f3e8a23ddfdd
--- /dev/null
+++ b/bolt/test/AArch64/pad-before-funcs.s
@@ -0,0 +1,48 @@
+# Test checks that --pad-before-funcs is working as expected.
+# It should be able to introduce a configurable offset for the _start symbol.
+# It should reject requests which don't obey the code alignment requirement.
+
+# Tests check inserting padding before _start; and additionally a test where
+# padding is inserted after start. In each case, check that the following
+# symbol ends up in the expected place as well.
+
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -Wl,--section-start=.text=0x4000
+# RUN: llvm-bolt %t.exe -o %t.bolt.0 --pad-funcs-before=_start:0
+# RUN: llvm-bolt %t.exe -o %t.bolt.4 --pad-funcs-before=_start:4
+# RUN: llvm-bolt %t.exe -o %t.bolt.8 --pad-funcs-before=_start:8
+# RUN: llvm-bolt %t.exe -o %t.bolt.4.4 --pad-funcs-before=_start:4 --pad-funcs=_start:4
+# RUN: llvm-bolt %t.exe -o %t.bolt.4.8 --pad-funcs-before=_start:4 --pad-funcs=_start:8
+
+# RUN: not llvm-bolt %t.exe -o %t.bolt.8 --pad-funcs-before=_start:1 2>&1 | FileCheck --check-prefix=CHECK-BAD-ALIGN %s
+
+# CHECK-BAD-ALIGN: user-requested 1 padding bytes before function _start(*2) is not a multiple of the minimum function alignment (4).
+
+# RUN: llvm-objdump --section=.text --disassemble %t.bolt.0 | FileCheck --check-prefix=CHECK-0 %s
+# RUN: llvm-objdump --section=.text --disassemble %t.bolt.4 | FileCheck --check-prefix=CHECK-4 %s
+# RUN: llvm-objdump --section=.text --disassemble %t.bolt.8 | FileCheck --check-prefix=CHECK-8 %s
+# RUN: llvm-objdump --section=.text --disassemble %t.bolt.4.4 | FileCheck --check-prefix=CHECK-4-4 %s
+# RUN: llvm-objdump --section=.text --disassemble %t.bolt.4.8 | FileCheck --check-prefix=CHECK-4-8 %s
+
+# Trigger relocation mode in bolt.
+.reloc 0, R_AARCH64_NONE
+
+.section .text
+
+# CHECK-0: 0000000000400000 <_start>
+# CHECK-4: 0000000000400004 <_start>
+# CHECK-4-4: 0000000000400004 <_start>
+# CHECK-8: 0000000000400008 <_start>
+.globl _start
+_start:
+    ret
+
+# CHECK-0: 0000000000400004 <_subsequent>
+# CHECK-4: 0000000000400008 <_subsequent>
+# CHECK-4-4: 000000000040000c <_subsequent>
+# CHECK-4-8: 0000000000400010 <_subsequent>
+# CHECK-8: 000000000040000c <_subsequent>
+.globl _subsequent
+_subsequent:
+    ret
diff --git a/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.cpp
index 8121a36f8034..1f432c4ccc5f 100644
--- a/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.cpp
@@ -74,9 +74,11 @@ void UnhandledSelfAssignmentCheck::registerMatchers(MatchFinder *Finder) {
     // Matcher for standard smart pointers.
     const auto SmartPointerType = qualType(hasUnqualifiedDesugaredType(
         recordType(hasDeclaration(classTemplateSpecializationDecl(
-            hasAnyName("::std::shared_ptr", "::std::unique_ptr",
-                       "::std::weak_ptr", "::std::auto_ptr"),
-            templateArgumentCountIs(1))))));
+            anyOf(allOf(hasAnyName("::std::shared_ptr", "::std::weak_ptr",
+                                   "::std::auto_ptr"),
+                        templateArgumentCountIs(1)),
+                  allOf(hasName("::std::unique_ptr"),
+                        templateArgumentCountIs(2))))))));
 
     // We will warn only if the class has a pointer or a C array field which
     // probably causes a problem during self-assignment (e.g. first resetting
diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp
index 37baae7a6f0c..f15fd4d9ad9f 100644
--- a/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp
@@ -29,6 +29,12 @@ static constexpr StringRef DefaultIncludeTypeRegex =
 
 AST_MATCHER(VarDecl, isLocalVarDecl) { return Node.isLocalVarDecl(); }
 AST_MATCHER(VarDecl, isReferenced) { return Node.isReferenced(); }
+AST_MATCHER_P(VarDecl, explicitMarkUnused, LangOptions, LangOpts) {
+  // Implementations should not emit a warning that a name-independent
+  // declaration is used or unused.
+  return Node.hasAttr<UnusedAttr>() ||
+         (LangOpts.CPlusPlus26 && Node.isPlaceholderVar(LangOpts));
+}
 AST_MATCHER(Type, isReferenceType) { return Node.isReferenceType(); }
 AST_MATCHER(QualType, isTrivial) {
   return Node.isTrivialType(Finder->getASTContext()) ||
@@ -60,7 +66,7 @@ void UnusedLocalNonTrivialVariableCheck::registerMatchers(MatchFinder *Finder) {
       varDecl(isLocalVarDecl(), unless(isReferenced()),
               unless(isExceptionVariable()), hasLocalStorage(), isDefinition(),
               unless(hasType(isReferenceType())), unless(hasType(isTrivial())),
-              unless(hasAttr(attr::Kind::Unused)),
+              unless(explicitMarkUnused(getLangOpts())),
               hasType(hasUnqualifiedDesugaredType(
                   anyOf(recordType(hasDeclaration(namedDecl(
                             matchesAnyListedName(IncludeTypes),
diff --git a/clang-tools-extra/clangd/HeuristicResolver.h b/clang-tools-extra/clangd/HeuristicResolver.h
index dcc063bbc4ad..c130e0677e86 100644
--- a/clang-tools-extra/clangd/HeuristicResolver.h
+++ b/clang-tools-extra/clangd/HeuristicResolver.h
@@ -26,13 +26,14 @@ class UnresolvedUsingValueDecl;
 
 namespace clangd {
 
-// This class heuristic resolution of declarations and types in template code.
+// This class handles heuristic resolution of declarations and types in template
+// code.
 //
 // As a compiler, clang only needs to perform certain types of processing on
 // template code (such as resolving dependent names to declarations, or
 // resolving the type of a dependent expression) after instantiation. Indeed,
 // C++ language features such as template specialization mean such resolution
-// cannot be done accurately before instantiation
+// cannot be done accurately before instantiation.
 //
 // However, template code is written and read in uninstantiated form, and clangd
 // would like to provide editor features like go-to-definition in template code
diff --git a/clang-tools-extra/clangd/unittests/CMakeLists.txt b/clang-tools-extra/clangd/unittests/CMakeLists.txt
index dffdcd5d014c..8dba8088908d 100644
--- a/clang-tools-extra/clangd/unittests/CMakeLists.txt
+++ b/clang-tools-extra/clangd/unittests/CMakeLists.txt
@@ -64,6 +64,7 @@ add_unittest(ClangdUnitTests ClangdTests
   GlobalCompilationDatabaseTests.cpp
   HeadersTests.cpp
   HeaderSourceSwitchTests.cpp
+  HeuristicResolverTests.cpp
   HoverTests.cpp
   IncludeCleanerTests.cpp
   IndexActionTests.cpp
diff --git a/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp b/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp
index 49a94045ea48..2c7f50d8c9e4 100644
--- a/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp
@@ -208,12 +208,13 @@ TEST_F(LSPTest, ClangTidyRename) {
   Annotations Source(R"cpp(
     void [[foo]]() {}
   )cpp");
-  Opts.ClangTidyProvider = [](tidy::ClangTidyOptions &ClangTidyOpts,
-                              llvm::StringRef) {
+  constexpr auto ClangTidyProvider = [](tidy::ClangTidyOptions &ClangTidyOpts,
+                                        llvm::StringRef) {
     ClangTidyOpts.Checks = {"-*,readability-identifier-naming"};
     ClangTidyOpts.CheckOptions["readability-identifier-naming.FunctionCase"] =
         "CamelCase";
   };
+  Opts.ClangTidyProvider = ClangTidyProvider;
   auto &Client = start();
   Client.didOpen("foo.hpp", Header.code());
   Client.didOpen("foo.cpp", Source.code());
@@ -266,10 +267,11 @@ TEST_F(LSPTest, ClangTidyCrash_Issue109367) {
   // This test requires clang-tidy checks to be linked in.
   if (!CLANGD_TIDY_CHECKS)
     return;
-  Opts.ClangTidyProvider = [](tidy::ClangTidyOptions &ClangTidyOpts,
-                              llvm::StringRef) {
+  constexpr auto ClangTidyProvider = [](tidy::ClangTidyOptions &ClangTidyOpts,
+                                        llvm::StringRef) {
     ClangTidyOpts.Checks = {"-*,boost-use-ranges"};
   };
+  Opts.ClangTidyProvider = ClangTidyProvider;
   // Check that registering the boost-use-ranges checker's matchers
   // on two different threads does not cause a crash.
   auto &Client = start();
diff --git a/clang-tools-extra/clangd/unittests/HeuristicResolverTests.cpp b/clang-tools-extra/clangd/unittests/HeuristicResolverTests.cpp
new file mode 100644
index 000000000000..e4b3822fc7eb
--- /dev/null
+++ b/clang-tools-extra/clangd/unittests/HeuristicResolverTests.cpp
@@ -0,0 +1,542 @@
+//===-- HeuristicResolverTests.cpp --------------------------*- C++ -*-----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "HeuristicResolver.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Tooling/Tooling.h"
+#include "gmock/gmock-matchers.h"
+#include "gtest/gtest.h"
+
+using namespace clang::ast_matchers;
+using clang::clangd::HeuristicResolver;
+using testing::ElementsAre;
+
+namespace clang {
+namespace {
+
+// Helper for matching a sequence of elements with a variadic list of matchers.
+// Usage: `ElementsAre(matchAdapter(Vs, MatchFunction)...)`, where `Vs...` is
+//        a variadic list of matchers.
+// For each `V` in `Vs`, this will match the corresponding element `E` if
+// `MatchFunction(V, E)` is true.
+MATCHER_P2(matchAdapter, MatcherForElement, MatchFunction, "matchAdapter") {
+  return MatchFunction(MatcherForElement, arg);
+}
+
+template <typename InputNode>
+using ResolveFnT = std::function<std::vector<const NamedDecl *>(
+    const HeuristicResolver *, const InputNode *)>;
+
+// Test heuristic resolution on `Code` using the resolution procedure
+// `ResolveFn`, which takes a `HeuristicResolver` and an input AST node of type
+// `InputNode` and returns a `std::vector<const NamedDecl *>`.
+// `InputMatcher` should be an AST matcher that matches a single node to pass as
+// input to `ResolveFn`, bound to the ID "input". `OutputMatchers` should be AST
+// matchers that each match a single node, bound to the ID "output".
+template <typename InputNode, typename InputMatcher, typename... OutputMatchers>
+void expectResolution(llvm::StringRef Code, ResolveFnT<InputNode> ResolveFn,
+                      const InputMatcher &IM, const OutputMatchers &...OMS) {
+  auto TU = tooling::buildASTFromCodeWithArgs(Code, {"-std=c++20"});
+  auto &Ctx = TU->getASTContext();
+  auto InputMatches = match(IM, Ctx);
+  ASSERT_EQ(1u, InputMatches.size());
+  const auto *Input = InputMatches[0].template getNodeAs<InputNode>("input");
+  ASSERT_TRUE(Input);
+
+  auto OutputNodeMatches = [&](auto &OutputMatcher, auto &Actual) {
+    auto OutputMatches = match(OutputMatcher, Ctx);
+    if (OutputMatches.size() != 1u)
+      return false;
+    const auto *ExpectedOutput =
+        OutputMatches[0].template getNodeAs<NamedDecl>("output");
+    if (!ExpectedOutput)
+      return false;
+    return ExpectedOutput == Actual;
+  };
+
+  HeuristicResolver H(Ctx);
+  auto Results = ResolveFn(&H, Input);
+  EXPECT_THAT(Results, ElementsAre(matchAdapter(OMS, OutputNodeMatches)...));
+}
+
+// Wrapper for the above that accepts a HeuristicResolver member function
+// pointer directly.
+template <typename InputNode, typename InputMatcher, typename... OutputMatchers>
+void expectResolution(llvm::StringRef Code,
+                      std::vector<const NamedDecl *> (
+                          HeuristicResolver::*ResolveFn)(const InputNode *)
+                          const,
+                      const InputMatcher &IM, const OutputMatchers &...OMS) {
+  expectResolution(Code, ResolveFnT<InputNode>(std::mem_fn(ResolveFn)), IM,
+                   OMS...);
+}
+
+TEST(HeuristicResolver, MemberExpr) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct S {
+      void bar() {}
+    };
+
+    template <typename T>
+    void foo(S<T> arg) {
+      arg.bar();
+    }
+  )cpp";
+  // Test resolution of "bar" in "arg.bar()".
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("bar")).bind("input"),
+      cxxMethodDecl(hasName("bar")).bind("output"));
+}
+
+TEST(HeuristicResolver, MemberExpr_Overloads) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct S {
+      void bar(int);
+      void bar(float);
+    };
+
+    template <typename T, typename U>
+    void foo(S<T> arg, U u) {
+      arg.bar(u);
+    }
+  )cpp";
+  // Test resolution of "bar" in "arg.bar(u)". Both overloads should be found.
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("bar")).bind("input"),
+      cxxMethodDecl(hasName("bar"), hasParameter(0, hasType(asString("int"))))
+          .bind("output"),
+      cxxMethodDecl(hasName("bar"), hasParameter(0, hasType(asString("float"))))
+          .bind("output"));
+}
+
+TEST(HeuristicResolver, MemberExpr_SmartPointer) {
+  std::string Code = R"cpp(
+    template <typename> struct S { void foo() {} };
+    template <typename T> struct unique_ptr {
+      T* operator->();
+    };
+    template <typename T>
+    void test(unique_ptr<S<T>>& v) {
+      v->foo();
+    }
+  )cpp";
+  // Test resolution of "foo" in "v->foo()".
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("foo")).bind("input"),
+      cxxMethodDecl(hasName("foo")).bind("output"));
+}
+
+TEST(HeuristicResolver, MemberExpr_Chained) {
+  std::string Code = R"cpp(
+    struct A { void foo() {} };
+    template <typename T>
+    struct B {
+      A func(int);
+      void bar() {
+        func(1).foo();
+      }
+    };
+  )cpp";
+  // Test resolution of "foo" in "func(1).foo()".
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("foo")).bind("input"),
+      cxxMethodDecl(hasName("foo")).bind("output"));
+}
+
+TEST(HeuristicResolver, MemberExpr_TemplateArgs) {
+  std::string Code = R"cpp(
+    struct Foo {
+      static Foo k(int);
+      template <typename T> T convert();
+    };
+    template <typename T>
+    void test() {
+      Foo::k(T()).template convert<T>();
+    }
+  )cpp";
+  // Test resolution of "convert" in "Foo::k(T()).template convert<T>()".
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("convert")).bind("input"),
+      functionTemplateDecl(hasName("convert")).bind("output"));
+}
+
+TEST(HeuristicResolver, MemberExpr_TypeAlias) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct Waldo {
+      void find();
+    };
+    template <typename T>
+    using Wally = Waldo<T>;
+    template <typename T>
+    void foo(Wally<T> w) {
+      w.find();
+    }
+  )cpp";
+  // Test resolution of "find" in "w.find()".
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("find")).bind("input"),
+      cxxMethodDecl(hasName("find")).bind("output"));
+}
+
+TEST(HeuristicResolver, MemberExpr_BaseClass_TypeAlias) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct Waldo {
+      void find();
+    };
+    template <typename T>
+    using Wally = Waldo<T>;
+    template <typename T>
+    struct S : Wally<T> {
+      void foo() {
+        this->find();
+      }
+    };
+  )cpp";
+  // Test resolution of "find" in "this->find()".
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("find")).bind("input"),
+      cxxMethodDecl(hasName("find")).bind("output"));
+}
+
+TEST(HeuristicResolver, MemberExpr_Metafunction) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct Waldo {
+      void find();
+    };
+    template <typename T>
+    struct MetaWaldo {
+      using Type = Waldo<T>;
+    };
+    template <typename T>
+    void foo(typename MetaWaldo<T>::Type w) {
+      w.find();
+    }
+  )cpp";
+  // Test resolution of "find" in "w.find()".
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("find")).bind("input"),
+      cxxMethodDecl(hasName("find")).bind("output"));
+}
+
+TEST(HeuristicResolver, MemberExpr_DeducedNonTypeTemplateParameter) {
+  std::string Code = R"cpp(
+    template <int N>
+    struct Waldo {
+      const int found = N;
+    };
+    template <Waldo W>
+    int foo() {
+      return W.found;
+    }
+  )cpp";
+  // Test resolution of "found" in "W.found".
+  expectResolution(
+      Code, &HeuristicResolver::resolveMemberExpr,
+      cxxDependentScopeMemberExpr(hasMemberName("found")).bind("input"),
+      fieldDecl(hasName("found")).bind("output"));
+}
+
+TEST(HeuristicResolver, DeclRefExpr_StaticMethod) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct S {
+      static void bar() {}
+    };
+
+    template <typename T>
+    void foo() {
+      S<T>::bar();
+    }
+  )cpp";
+  // Test resolution of "bar" in "S<T>::bar()".
+  expectResolution(
+      Code, &HeuristicResolver::resolveDeclRefExpr,
+      dependentScopeDeclRefExpr(hasDependentName("bar")).bind("input"),
+      cxxMethodDecl(hasName("bar")).bind("output"));
+}
+
+TEST(HeuristicResolver, DeclRefExpr_StaticOverloads) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct S {
+      static void bar(int);
+      static void bar(float);
+    };
+
+    template <typename T, typename U>
+    void foo(U u) {
+      S<T>::bar(u);
+    }
+  )cpp";
+  // Test resolution of "bar" in "S<T>::bar(u)". Both overloads should be found.
+  expectResolution(
+      Code, &HeuristicResolver::resolveDeclRefExpr,
+      dependentScopeDeclRefExpr(hasDependentName("bar")).bind("input"),
+      cxxMethodDecl(hasName("bar"), hasParameter(0, hasType(asString("int"))))
+          .bind("output"),
+      cxxMethodDecl(hasName("bar"), hasParameter(0, hasType(asString("float"))))
+          .bind("output"));
+}
+
+TEST(HeuristicResolver, DeclRefExpr_Enumerator) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct Foo {
+      enum class E { A, B };
+      E e = E::A;
+    };
+  )cpp";
+  // Test resolution of "A" in "E::A".
+  expectResolution(
+      Code, &HeuristicResolver::resolveDeclRefExpr,
+      dependentScopeDeclRefExpr(hasDependentName("A")).bind("input"),
+      enumConstantDecl(hasName("A")).bind("output"));
+}
+
+TEST(HeuristicResolver, DeclRefExpr_RespectScope) {
+  std::string Code = R"cpp(
+    template <typename Info>
+    struct PointerIntPair {
+      void *getPointer() const { return Info::getPointer(); }
+    };
+  )cpp";
+  // Test resolution of "getPointer" in "Info::getPointer()".
+  // Here, we are testing that we do not incorrectly get the enclosing
+  // getPointer() function as a result.
+  expectResolution(
+      Code, &HeuristicResolver::resolveDeclRefExpr,
+      dependentScopeDeclRefExpr(hasDependentName("getPointer")).bind("input"));
+}
+
+TEST(HeuristicResolver, DependentNameType) {
+  std::string Code = R"cpp(
+    template <typename>
+    struct A {
+      struct B {};
+    };
+    template <typename T>
+    void foo(typename A<T>::B);
+  )cpp";
+  // Tests resolution of "B" in "A<T>::B".
+  expectResolution(
+      Code, &HeuristicResolver::resolveDependentNameType,
+      functionDecl(hasParameter(0, hasType(dependentNameType().bind("input")))),
+      classTemplateDecl(
+          has(cxxRecordDecl(has(cxxRecordDecl(hasName("B")).bind("output"))))));
+}
+
+TEST(HeuristicResolver, DependentNameType_Nested) {
+  std::string Code = R"cpp(
+    template <typename>
+    struct A {
+      struct B {
+        struct C {};
+      };
+    };
+    template <typename T>
+    void foo(typename A<T>::B::C);
+  )cpp";
+  // Tests resolution of "C" in "A<T>::B::C".
+  expectResolution(
+      Code, &HeuristicResolver::resolveDependentNameType,
+      functionDecl(hasParameter(0, hasType(dependentNameType().bind("input")))),
+      classTemplateDecl(has(cxxRecordDecl(has(
+          cxxRecordDecl(has(cxxRecordDecl(hasName("C")).bind("output"))))))));
+}
+
+TEST(HeuristicResolver, DependentNameType_Recursion) {
+  std::string Code = R"cpp(
+    template <int N>
+    struct Waldo {
+      using Type = typename Waldo<N - 1>::Type::Next;
+    };
+  )cpp";
+  // Test resolution of "Next" in "typename Waldo<N - 1>::Type::Next".
+  // Here, we are testing that we do not get into an infinite recursion.
+  expectResolution(Code, &HeuristicResolver::resolveDependentNameType,
+                   typeAliasDecl(hasType(dependentNameType().bind("input"))));
+}
+
+TEST(HeuristicResolver, DependentNameType_MutualRecursion) {
+  std::string Code = R"cpp(
+    template <int N>
+    struct Odd;
+    template <int N>
+    struct Even {
+      using Type = typename Odd<N - 1>::Type::Next;
+    };
+    template <int N>
+    struct Odd {
+      using Type = typename Even<N - 1>::Type::Next;
+    };
+  )cpp";
+  // Test resolution of "Next" in "typename Even<N - 1>::Type::Next".
+  // Similar to the above but we have two mutually recursive templates.
+  expectResolution(
+      Code, &HeuristicResolver::resolveDependentNameType,
+      classTemplateDecl(hasName("Odd"),
+                        has(cxxRecordDecl(has(typeAliasDecl(
+                            hasType(dependentNameType().bind("input"))))))));
+}
+
+TEST(HeuristicResolver, NestedNameSpecifier) {
+  // Test resolution of "B" in "A<T>::B::C".
+  // Unlike the "C", the "B" does not get its own DependentNameTypeLoc node,
+  // so the resolution uses the NestedNameSpecifier as input.
+  std::string Code = R"cpp(
+    template <typename>
+    struct A {
+      struct B {
+        struct C {};
+      };
+    };
+    template <typename T>
+    void foo(typename A<T>::B::C);
+  )cpp";
+  // Adapt the call to resolveNestedNameSpecifierToType() to the interface
+  // expected by expectResolution() (returning a vector of decls).
+  ResolveFnT<NestedNameSpecifier> ResolveFn =
+      [](const HeuristicResolver *H,
+         const NestedNameSpecifier *NNS) -> std::vector<const NamedDecl *> {
+    return {H->resolveNestedNameSpecifierToType(NNS)->getAsCXXRecordDecl()};
+  };
+  expectResolution(Code, ResolveFn,
+                   nestedNameSpecifier(hasPrefix(specifiesType(hasDeclaration(
+                                           classTemplateDecl(hasName("A"))))))
+                       .bind("input"),
+                   classTemplateDecl(has(cxxRecordDecl(
+                       has(cxxRecordDecl(hasName("B")).bind("output"))))));
+}
+
+TEST(HeuristicResolver, TemplateSpecializationType) {
+  std::string Code = R"cpp(
+    template <typename>
+    struct A {
+      template <typename>
+      struct B {};
+    };
+    template <typename T>
+    void foo(typename A<T>::template B<int>);
+  )cpp";
+  // Test resolution of "B" in "A<T>::template B<int>".
+  expectResolution(Code, &HeuristicResolver::resolveTemplateSpecializationType,
+                   functionDecl(hasParameter(0, hasType(type().bind("input")))),
+                   classTemplateDecl(has(cxxRecordDecl(
+                       has(classTemplateDecl(hasName("B")).bind("output"))))));
+}
+
+TEST(HeuristicResolver, DependentCall_NonMember) {
+  std::string Code = R"cpp(
+    template <typename T>
+    void nonmember(T);
+    template <typename T>
+    void bar(T t) {
+      nonmember(t);
+    }
+  )cpp";
+  // Test resolution of "nonmember" in "nonmember(t)".
+  expectResolution(Code, &HeuristicResolver::resolveCalleeOfCallExpr,
+                   callExpr(callee(unresolvedLookupExpr(hasAnyDeclaration(
+                                functionTemplateDecl(hasName("nonmember"))))))
+                       .bind("input"),
+                   functionTemplateDecl(hasName("nonmember")).bind("output"));
+}
+
+TEST(HeuristicResolver, DependentCall_Member) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct A {
+      void member(T);
+    };
+    template <typename T>
+    void bar(A<T> a, T t) {
+      a.member(t);
+    }
+  )cpp";
+  // Test resolution of "member" in "a.member(t)".
+  expectResolution(
+      Code, &HeuristicResolver::resolveCalleeOfCallExpr,
+      callExpr(callee(cxxDependentScopeMemberExpr(hasMemberName("member"))))
+          .bind("input"),
+      cxxMethodDecl(hasName("member")).bind("output"));
+}
+
+TEST(HeuristicResolver, DependentCall_StaticMember) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct A {
+      static void static_member(T);
+    };
+    template <typename T>
+    void bar(T t) {
+      A<T>::static_member(t);
+    }
+  )cpp";
+  // Test resolution of "static_member" in "A<T>::static_member(t)".
+  expectResolution(Code, &HeuristicResolver::resolveCalleeOfCallExpr,
+                   callExpr(callee(dependentScopeDeclRefExpr(
+                                hasDependentName("static_member"))))
+                       .bind("input"),
+                   cxxMethodDecl(hasName("static_member")).bind("output"));
+}
+
+TEST(HeuristicResolver, DependentCall_Overload) {
+  std::string Code = R"cpp(
+    void overload(int);
+    void overload(double);
+    template <typename T>
+    void bar(T t) {
+      overload(t);
+    }
+  )cpp";
+  // Test resolution of "overload" in "overload(t)". Both overload should be
+  // found.
+  expectResolution(Code, &HeuristicResolver::resolveCalleeOfCallExpr,
+                   callExpr(callee(unresolvedLookupExpr(hasAnyDeclaration(
+                                functionDecl(hasName("overload"))))))
+                       .bind("input"),
+                   functionDecl(hasName("overload"),
+                                hasParameter(0, hasType(asString("double"))))
+                       .bind("output"),
+                   functionDecl(hasName("overload"),
+                                hasParameter(0, hasType(asString("int"))))
+                       .bind("output"));
+}
+
+TEST(HeuristicResolver, UsingValueDecl) {
+  std::string Code = R"cpp(
+    template <typename T>
+    struct Base {
+      void waldo();
+    };
+    template <typename T>
+    struct Derived : Base<T> {
+      using Base<T>::waldo;
+    };
+  )cpp";
+  // Test resolution of "waldo" in "Base<T>::waldo".
+  expectResolution(Code, &HeuristicResolver::resolveUsingValueDecl,
+                   unresolvedUsingValueDecl(hasName("waldo")).bind("input"),
+                   cxxMethodDecl(hasName("waldo")).bind("output"));
+}
+
+} // namespace
+} // namespace clang
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 1fd9b6077be5..94e15639c4a9 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -233,10 +233,18 @@ Changes in existing checks
   `bsl::optional` and `bdlb::NullableValue` from
   <https://github.com/bloomberg/bde>_.
 
+- Improved :doc:`bugprone-unhandled-self-assignment
+  <clang-tidy/checks/bugprone/unhandled-self-assignment>` check by fixing smart
+  pointer check against std::unique_ptr type.
+
 - Improved :doc:`bugprone-unsafe-functions
   <clang-tidy/checks/bugprone/unsafe-functions>` check to allow specifying
   additional functions to match.
 
+- Improved :doc:`bugprone-unused-local-non-trivial-variable
+  <clang-tidy/checks/bugprone/unused-local-non-trivial-variable>` check to avoid
+  false positives when using name-independent variables after C++26.
+
 - Improved :doc:`bugprone-use-after-move
   <clang-tidy/checks/bugprone/use-after-move>` to avoid triggering on
   ``reset()`` calls on moved-from ``std::optional`` and ``std::any`` objects,
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst
index 9f283de78fbd..672eab62b4af 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst
@@ -12,6 +12,7 @@ The following types of variables are excluded from this check:
 * static or thread local
 * structured bindings
 * variables with ``[[maybe_unused]]`` attribute
+* name-independent variables
 
 This check can be configured to warn on all non-trivial variables by setting
 `IncludeTypes` to `.*`, and excluding specific types using `ExcludeTypes`.
diff --git a/clang-tools-extra/include-cleaner/lib/Analysis.cpp b/clang-tools-extra/include-cleaner/lib/Analysis.cpp
index 16013f53894e..e3a4834cb19a 100644
--- a/clang-tools-extra/include-cleaner/lib/Analysis.cpp
+++ b/clang-tools-extra/include-cleaner/lib/Analysis.cpp
@@ -85,8 +85,9 @@ analyze(llvm::ArrayRef<Decl *> ASTRoots,
   const auto MainFile = *SM.getFileEntryRefForID(SM.getMainFileID());
   llvm::DenseSet<const Include *> Used;
   llvm::StringMap<Header> Missing;
+  constexpr auto DefaultHeaderFilter = [](llvm::StringRef) { return false; };
   if (!HeaderFilter)
-    HeaderFilter = [](llvm::StringRef) { return false; };
+    HeaderFilter = DefaultHeaderFilter;
   OptionalDirectoryEntryRef ResourceDir =
       PP.getHeaderSearchInfo().getModuleMap().getBuiltinDir();
   walkUsed(ASTRoots, MacroRefs, PI, PP,
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-self-assignment.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-self-assignment.cpp
index 14d27855d7c5..8610393449f9 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-self-assignment.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-self-assignment.cpp
@@ -10,7 +10,9 @@ template <class T>
 T &&move(T &x) {
 }
 
-template <class T>
+template <typename T> class default_delete {};
+
+template <class T, typename Deleter = std::default_delete<T>>
 class unique_ptr {
 };
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable-name-independence.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable-name-independence.cpp
new file mode 100644
index 000000000000..bcc8b810acab
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable-name-independence.cpp
@@ -0,0 +1,21 @@
+// RUN: %check_clang_tidy -std=c++23 -check-suffixes=,CXX23 %s bugprone-unused-local-non-trivial-variable %t -- \
+// RUN:       -config="{CheckOptions: {bugprone-unused-local-non-trivial-variable.IncludeTypes: '::async::Foo'}}" \
+// RUN:       --
+// RUN: %check_clang_tidy -std=c++26 %s bugprone-unused-local-non-trivial-variable %t -- \
+// RUN:       -config="{CheckOptions: {bugprone-unused-local-non-trivial-variable.IncludeTypes: '::async::Foo'}}" \
+// RUN:       --
+
+namespace async {
+class Foo {
+  public:
+    ~Foo();
+  private:
+};
+} // namespace async
+
+void check() {
+  async::Foo C;
+  // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: unused local variable 'C' of type 'async::Foo' [bugprone-unused-local-non-trivial-variable]
+  async::Foo _;
+  // CHECK-MESSAGES-CXX23: :[[@LINE-1]]:14: warning: unused local variable '_' of type 'async::Foo' [bugprone-unused-local-non-trivial-variable]
+}
diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html
index 18f9e7d6c0ea..48dfd9cac003 100644
--- a/clang/docs/LibASTMatchersReference.html
+++ b/clang/docs/LibASTMatchersReference.html
@@ -3462,6 +3462,21 @@ dependentScopeDeclRefExpr(hasDependentName("v")) matches `T::v`
 </pre></td></tr>
 
 
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1DependentNameType.html">DependentNameType</a>&gt;</td><td class="name" onclick="toggle('hasDependentName1')"><a name="hasDependentName1Anchor">hasDependentName</a></td><td>std::string N</td></tr>
+<tr><td colspan="4" class="doc" id="hasDependentName1"><pre>Matches the dependent name of a DependentNameType.
+
+Matches the dependent name of a DependentNameType
+
+Given:
+
+  template &lt;typename T&lt; struct declToImport {
+    typedef typename T::type dependent_name;
+  };
+
+dependentNameType(hasDependentName("type")) matches `T::type`
+</pre></td></tr>
+
+
 <tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXDependentScopeMemberExpr.html">CXXDependentScopeMemberExpr</a>&gt;</td><td class="name" onclick="toggle('memberHasSameNameAsBoundNode0')"><a name="memberHasSameNameAsBoundNode0Anchor">memberHasSameNameAsBoundNode</a></td><td>std::string BindingID</td></tr>
 <tr><td colspan="4" class="doc" id="memberHasSameNameAsBoundNode0"><pre>Matches template-dependent, but known, member names against an already-bound
 node
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 8a48a9e3e1f6..2258452d07ec 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -704,6 +704,16 @@ Improvements to Clang's diagnostics
       return ptr + index < ptr; // warning
     }
 
+- Clang now emits a ``-Wvarargs`` diagnostic when the second argument
+  to ``va_arg`` is of array type, which is an undefined behavior (#GH119360).
+
+  .. code-block:: c++
+
+    void test() {
+      va_list va;
+      va_arg(va, int[10]); // warning
+    }
+
 - Fix -Wdangling false positives on conditional operators (#120206).
 
 - Fixed a bug where Clang hung on an unsupported optional scope specifier ``::`` when parsing
@@ -754,6 +764,7 @@ Bug Fixes in This Version
   the unsupported type instead of the ``register`` keyword (#GH109776).
 - Fixed a crash when emit ctor for global variant with flexible array init (#GH113187).
 - Fixed a crash when GNU statement expression contains invalid statement (#GH113468).
+- Fixed a crash when passing the variable length array type to ``va_arg`` (#GH119360).
 - Fixed a failed assertion when using ``__attribute__((noderef))`` on an
   ``_Atomic``-qualified type (#GH116124).
 - No longer return ``false`` for ``noexcept`` expressions involving a
@@ -1119,7 +1130,7 @@ AST Matchers
 
 - Add ``dependentTemplateSpecializationType`` matcher to match a dependent template specialization type.
 
-- Add ``hasDependentName`` matcher to match the dependent name of a DependentScopeDeclRefExpr.
+- Add ``hasDependentName`` matcher to match the dependent name of a DependentScopeDeclRefExpr or DependentNameType.
 
 clang-format
 ------------
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index 29d5e1f92a69..e093b2d672a7 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -476,6 +476,9 @@ cplusplus.NewDelete (C++)
 """""""""""""""""""""""""
 Check for double-free and use-after-free problems. Traces memory managed by new/delete.
 
+Custom allocation/deallocation functions can be defined using
+:ref:`ownership attributes<analyzer-ownership-attrs>`.
+
 .. literalinclude:: checkers/newdelete_example.cpp
     :language: cpp
 
@@ -485,6 +488,9 @@ cplusplus.NewDeleteLeaks (C++)
 """"""""""""""""""""""""""""""
 Check for memory leaks. Traces memory managed by new/delete.
 
+Custom allocation/deallocation functions can be defined using
+:ref:`ownership attributes<analyzer-ownership-attrs>`.
+
 .. code-block:: cpp
 
  void test() {
@@ -1263,6 +1269,9 @@ You can silence this warning either by bound checking the ``size`` parameter, or
 by explicitly marking the ``size`` parameter as sanitized. See the
 :ref:`optin-taint-GenericTaint` checker for an example.
 
+Custom allocation/deallocation functions can be defined using
+:ref:`ownership attributes<analyzer-ownership-attrs>`.
+
 .. code-block:: c
 
   void vulnerable(void) {
@@ -1857,6 +1866,9 @@ unix.Malloc (C)
 """""""""""""""
 Check for memory leaks, double free, and use-after-free problems. Traces memory managed by malloc()/free().
 
+Custom allocation/deallocation functions can be defined using
+:ref:`ownership attributes<analyzer-ownership-attrs>`.
+
 .. literalinclude:: checkers/unix_malloc_example.c
     :language: c
 
@@ -1866,6 +1878,9 @@ unix.MallocSizeof (C)
 """""""""""""""""""""
 Check for dubious ``malloc`` arguments involving ``sizeof``.
 
+Custom allocation/deallocation functions can be defined using
+:ref:`ownership attributes<analyzer-ownership-attrs>`.
+
 .. code-block:: c
 
  void test() {
@@ -1881,6 +1896,9 @@ unix.MismatchedDeallocator (C, C++)
 """""""""""""""""""""""""""""""""""
 Check for mismatched deallocators.
 
+Custom allocation/deallocation functions can be defined using
+:ref:`ownership attributes<analyzer-ownership-attrs>`.
+
 .. literalinclude:: checkers/mismatched_deallocator_example.cpp
     :language: c
 
diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index 3f95f1db2fbe..63d266dc60ec 100644
--- a/clang/include/clang-c/Index.h
+++ b/clang/include/clang-c/Index.h
@@ -2202,7 +2202,11 @@ enum CXCursorKind {
    */
   CXCursor_OpenACCSetConstruct = 330,
 
-  CXCursor_LastStmt = CXCursor_OpenACCSetConstruct,
+  /** OpenACC update Construct.
+   */
+  CXCursor_OpenACCUpdateConstruct = 331,
+
+  CXCursor_LastStmt = CXCursor_OpenACCUpdateConstruct,
 
   /**
    * Cursor that represents the translation unit itself.
diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h
index adc5e48583d0..4e4dd3447926 100644
--- a/clang/include/clang/AST/OpenACCClause.h
+++ b/clang/include/clang/AST/OpenACCClause.h
@@ -327,18 +327,89 @@ public:
                                  SourceLocation EndLoc);
 };
 
-/// A 'self' clause, which has an optional condition expression.
-class OpenACCSelfClause : public OpenACCClauseWithCondition {
+/// A 'self' clause, which has an optional condition expression, or, in the
+/// event of an 'update' directive, contains a 'VarList'.
+class OpenACCSelfClause final
+    : public OpenACCClauseWithParams,
+      private llvm::TrailingObjects<OpenACCSelfClause, Expr *> {
+  friend TrailingObjects;
+  // Holds whether this HAS a condition expression. Lacks a value if this is NOT
+  // a condition-expr self clause.
+  std::optional<bool> HasConditionExpr;
+  // Holds the number of stored expressions.  In the case of a condition-expr
+  // self clause, this is expected to be ONE (and there to be 1 trailing
+  // object), whether or not that is null.
+  unsigned NumExprs;
+
   OpenACCSelfClause(SourceLocation BeginLoc, SourceLocation LParenLoc,
                     Expr *ConditionExpr, SourceLocation EndLoc);
+  OpenACCSelfClause(SourceLocation BeginLoc, SourceLocation LParenLoc,
+                    ArrayRef<Expr *> VarList, SourceLocation EndLoc);
+
+  // Intentionally internal, meant to be an implementation detail of everything
+  // else. All non-internal uses should go through getConditionExpr/getVarList.
+  llvm::ArrayRef<Expr *> getExprs() const {
+    return {getTrailingObjects<Expr *>(), NumExprs};
+  }
 
 public:
   static bool classof(const OpenACCClause *C) {
     return C->getClauseKind() == OpenACCClauseKind::Self;
   }
+
+  bool isConditionExprClause() const { return HasConditionExpr.has_value(); }
+
+  bool hasConditionExpr() const {
+    assert(HasConditionExpr.has_value() &&
+           "VarList Self Clause asked about condition expression");
+    return *HasConditionExpr;
+  }
+
+  const Expr *getConditionExpr() const {
+    assert(HasConditionExpr.has_value() &&
+           "VarList Self Clause asked about condition expression");
+    assert(getExprs().size() == 1 &&
+           "ConditionExpr Self Clause with too many Exprs");
+    return getExprs()[0];
+  }
+
+  Expr *getConditionExpr() {
+    assert(HasConditionExpr.has_value() &&
+           "VarList Self Clause asked about condition expression");
+    assert(getExprs().size() == 1 &&
+           "ConditionExpr Self Clause with too many Exprs");
+    return getExprs()[0];
+  }
+
+  ArrayRef<Expr *> getVarList() {
+    assert(!HasConditionExpr.has_value() &&
+           "Condition Expr self clause asked about var list");
+    return getExprs();
+  }
+  ArrayRef<Expr *> getVarList() const {
+    assert(!HasConditionExpr.has_value() &&
+           "Condition Expr self clause asked about var list");
+    return getExprs();
+  }
+
+  child_range children() {
+    return child_range(
+        reinterpret_cast<Stmt **>(getTrailingObjects<Expr *>()),
+        reinterpret_cast<Stmt **>(getTrailingObjects<Expr *>() + NumExprs));
+  }
+
+  const_child_range children() const {
+    child_range Children = const_cast<OpenACCSelfClause *>(this)->children();
+    return const_child_range(Children.begin(), Children.end());
+  }
+
   static OpenACCSelfClause *Create(const ASTContext &C, SourceLocation BeginLoc,
                                    SourceLocation LParenLoc,
                                    Expr *ConditionExpr, SourceLocation EndLoc);
+  static OpenACCSelfClause *Create(const ASTContext &C, SourceLocation BeginLoc,
+                                   SourceLocation LParenLoc,
+                                   ArrayRef<Expr *> ConditionExpr,
+                                   SourceLocation EndLoc);
 };
 
 /// Represents a clause that has one or more expressions associated with it.
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index 92954cf566c8..d500f4eadef7 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -4082,6 +4082,8 @@ DEF_TRAVERSE_STMT(OpenACCShutdownConstruct,
                   { TRY_TO(VisitOpenACCClauseList(S->clauses())); })
 DEF_TRAVERSE_STMT(OpenACCSetConstruct,
                   { TRY_TO(VisitOpenACCClauseList(S->clauses())); })
+DEF_TRAVERSE_STMT(OpenACCUpdateConstruct,
+                  { TRY_TO(VisitOpenACCClauseList(S->clauses())); })
 
 // Traverse HLSL: Out argument expression
 DEF_TRAVERSE_STMT(HLSLOutArgExpr, {})
diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h
index d3cc106ff008..ebbee152f918 100644
--- a/clang/include/clang/AST/StmtOpenACC.h
+++ b/clang/include/clang/AST/StmtOpenACC.h
@@ -712,5 +712,44 @@ public:
                                      SourceLocation End,
                                      ArrayRef<const OpenACCClause *> Clauses);
 };
+// This class represents an 'update' construct, which has just a clause list.
+class OpenACCUpdateConstruct final
+    : public OpenACCConstructStmt,
+      private llvm::TrailingObjects<OpenACCUpdateConstruct,
+                                    const OpenACCClause *> {
+  friend TrailingObjects;
+  OpenACCUpdateConstruct(unsigned NumClauses)
+      : OpenACCConstructStmt(OpenACCUpdateConstructClass,
+                             OpenACCDirectiveKind::Update, SourceLocation{},
+                             SourceLocation{}, SourceLocation{}) {
+    std::uninitialized_value_construct(
+        getTrailingObjects<const OpenACCClause *>(),
+        getTrailingObjects<const OpenACCClause *>() + NumClauses);
+    setClauseList(MutableArrayRef(getTrailingObjects<const OpenACCClause *>(),
+                                  NumClauses));
+  }
+
+  OpenACCUpdateConstruct(SourceLocation Start, SourceLocation DirectiveLoc,
+                         SourceLocation End,
+                         ArrayRef<const OpenACCClause *> Clauses)
+      : OpenACCConstructStmt(OpenACCUpdateConstructClass,
+                             OpenACCDirectiveKind::Update, Start, DirectiveLoc,
+                             End) {
+    std::uninitialized_copy(Clauses.begin(), Clauses.end(),
+                            getTrailingObjects<const OpenACCClause *>());
+    setClauseList(MutableArrayRef(getTrailingObjects<const OpenACCClause *>(),
+                                  Clauses.size()));
+  }
+
+public:
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == OpenACCUpdateConstructClass;
+  }
+  static OpenACCUpdateConstruct *CreateEmpty(const ASTContext &C,
+                                             unsigned NumClauses);
+  static OpenACCUpdateConstruct *
+  Create(const ASTContext &C, SourceLocation Start, SourceLocation DirectiveLoc,
+         SourceLocation End, ArrayRef<const OpenACCClause *> Clauses);
+};
 } // namespace clang
 #endif // LLVM_CLANG_AST_STMTOPENACC_H
diff --git a/clang/include/clang/AST/TextNodeDumper.h b/clang/include/clang/AST/TextNodeDumper.h
index 59cd3ce5c8fb..4aaae48ba8b4 100644
--- a/clang/include/clang/AST/TextNodeDumper.h
+++ b/clang/include/clang/AST/TextNodeDumper.h
@@ -419,6 +419,7 @@ public:
   void VisitOpenACCInitConstruct(const OpenACCInitConstruct *S);
   void VisitOpenACCSetConstruct(const OpenACCSetConstruct *S);
   void VisitOpenACCShutdownConstruct(const OpenACCShutdownConstruct *S);
+  void VisitOpenACCUpdateConstruct(const OpenACCUpdateConstruct *S);
   void VisitOpenACCAsteriskSizeExpr(const OpenACCAsteriskSizeExpr *S);
   void VisitEmbedExpr(const EmbedExpr *S);
   void VisitAtomicExpr(const AtomicExpr *AE);
diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
index f10135d7a901..f32170c93bee 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchers.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -3257,15 +3257,27 @@ AST_MATCHER_P(CXXDependentScopeMemberExpr, memberHasSameNameAsBoundNode,
       });
 }
 
-/// Matches the dependent name of a DependentScopeDeclRefExpr
+/// Matches the dependent name of a DependentScopeDeclRefExpr or
+/// DependentNameType
 ///
 /// Given:
 /// \code
 ///  template <class T> class X : T { void f() { T::v; } };
 /// \endcode
 /// \c dependentScopeDeclRefExpr(hasDependentName("v")) matches `T::v`
-AST_MATCHER_P(DependentScopeDeclRefExpr, hasDependentName, std::string, N) {
-  return Node.getDeclName().getAsString() == N;
+///
+/// Given:
+/// \code
+///  template <typename T> struct declToImport {
+///    typedef typename T::type dependent_name;
+///  };
+/// \endcode
+/// \c dependentNameType(hasDependentName("type")) matches `T::type`
+AST_POLYMORPHIC_MATCHER_P(hasDependentName,
+                          AST_POLYMORPHIC_SUPPORTED_TYPES(
+                              DependentScopeDeclRefExpr, DependentNameType),
+                          std::string, N) {
+  return internal::getDependentName(Node) == N;
 }
 
 /// Matches C++ classes that are directly or indirectly derived from a class
@@ -7724,7 +7736,7 @@ AST_MATCHER_P(DecayedType, hasDecayedType, internal::Matcher<QualType>,
 
 /// Matches a dependent name type
 ///
-/// Example matches  T::type
+/// Example matches T::type
 /// \code
 ///  template <typename T> struct declToImport {
 ///    typedef typename T::type dependent_name;
diff --git a/clang/include/clang/ASTMatchers/ASTMatchersInternal.h b/clang/include/clang/ASTMatchers/ASTMatchersInternal.h
index 04804d5def04..1f7b5e7cac84 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchersInternal.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchersInternal.h
@@ -2343,6 +2343,14 @@ MatchTemplateArgLocAt(const TemplateSpecializationTypeLoc &Node,
          InnerMatcher.matches(Node.getArgLoc(Index), Finder, Builder);
 }
 
+inline std::string getDependentName(const DependentScopeDeclRefExpr &node) {
+  return node.getDeclName().getAsString();
+}
+
+inline std::string getDependentName(const DependentNameType &node) {
+  return node.getIdentifier()->getName().str();
+}
+
 } // namespace internal
 
 } // namespace ast_matchers
diff --git a/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h b/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h
index 48c528736773..aaf89f4e94d4 100644
--- a/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h
+++ b/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h
@@ -13,7 +13,9 @@
 #ifndef LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CACHED_CONST_ACCESSORS_LATTICE_H
 #define LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CACHED_CONST_ACCESSORS_LATTICE_H
 
+#include "clang/AST/Decl.h"
 #include "clang/AST/Expr.h"
+#include "clang/AST/Type.h"
 #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
 #include "clang/Analysis/FlowSensitive/DataflowLattice.h"
 #include "clang/Analysis/FlowSensitive/StorageLocation.h"
@@ -71,10 +73,27 @@ public:
   /// Requirements:
   ///
   ///  - `CE` should return a location (GLValue or a record type).
+  ///
+  /// DEPRECATED: switch users to the below overload which takes Callee and Type
+  /// directly.
   StorageLocation *getOrCreateConstMethodReturnStorageLocation(
       const RecordStorageLocation &RecordLoc, const CallExpr *CE,
       Environment &Env, llvm::function_ref<void(StorageLocation &)> Initialize);
 
+  /// Creates or returns a previously created `StorageLocation` associated with
+  /// a const method call `obj.getFoo()` where `RecordLoc` is the
+  /// `RecordStorageLocation` of `obj`, `Callee` is the decl for `getFoo`.
+  ///
+  /// The callback `Initialize` runs on the storage location if newly created.
+  ///
+  /// Requirements:
+  ///
+  ///  - `Callee` should return a location (return type is a reference type or a
+  ///     record type).
+  StorageLocation &getOrCreateConstMethodReturnStorageLocation(
+      const RecordStorageLocation &RecordLoc, const FunctionDecl *Callee,
+      Environment &Env, llvm::function_ref<void(StorageLocation &)> Initialize);
+
   void clearConstMethodReturnValues(const RecordStorageLocation &RecordLoc) {
     ConstMethodReturnValues.erase(&RecordLoc);
   }
@@ -212,6 +231,27 @@ CachedConstAccessorsLattice<Base>::getOrCreateConstMethodReturnStorageLocation(
   return &Loc;
 }
 
+template <typename Base>
+StorageLocation &
+CachedConstAccessorsLattice<Base>::getOrCreateConstMethodReturnStorageLocation(
+    const RecordStorageLocation &RecordLoc, const FunctionDecl *Callee,
+    Environment &Env, llvm::function_ref<void(StorageLocation &)> Initialize) {
+  assert(Callee != nullptr);
+  QualType Type = Callee->getReturnType();
+  assert(!Type.isNull());
+  assert(Type->isReferenceType() || Type->isRecordType());
+  auto &ObjMap = ConstMethodReturnStorageLocations[&RecordLoc];
+  auto it = ObjMap.find(Callee);
+  if (it != ObjMap.end())
+    return *it->second;
+
+  StorageLocation &Loc = Env.createStorageLocation(Type.getNonReferenceType());
+  Initialize(Loc);
+
+  ObjMap.insert({Callee, &Loc});
+  return Loc;
+}
+
 } // namespace dataflow
 } // namespace clang
 
diff --git a/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h b/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
index 713494178b97..fb11c2e230e3 100644
--- a/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
+++ b/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
@@ -37,14 +37,13 @@ struct UncheckedOptionalAccessModelOptions {
   /// can't identify when their results are used safely (across calls),
   /// resulting in false positives in all such cases. Note: this option does not
   /// cover access through `operator[]`.
-  /// FIXME: we currently cache and equate the result of const accessors
-  /// returning pointers, so cover the case of operator-> followed by
-  /// operator->, which covers the common case of smart pointers. We also cover
-  /// some limited cases of returning references (if return type is an optional
-  /// type), so cover some cases of operator* followed by operator*. We don't
-  /// cover mixing operator-> and operator*. Once we are confident in this const
-  /// accessor caching, we shouldn't need the IgnoreSmartPointerDereference
-  /// option anymore.
+  ///
+  /// FIXME: we now cache and equate the result of const accessors
+  /// that look like unique_ptr, have both `->` (returning a pointer type) and
+  /// `*` (returning a reference type). This includes mixing `->` and
+  /// `*` in a sequence of calls as long as the object is not modified. Once we
+  /// are confident in this const accessor caching, we shouldn't need the
+  /// IgnoreSmartPointerDereference option anymore.
   bool IgnoreSmartPointerDereference = false;
 };
 
diff --git a/clang/include/clang/Analysis/FlowSensitive/SmartPointerAccessorCaching.h b/clang/include/clang/Analysis/FlowSensitive/SmartPointerAccessorCaching.h
index 3e4016518eaa..1b116a0cf76e 100644
--- a/clang/include/clang/Analysis/FlowSensitive/SmartPointerAccessorCaching.h
+++ b/clang/include/clang/Analysis/FlowSensitive/SmartPointerAccessorCaching.h
@@ -27,8 +27,13 @@
 #include <cassert>
 
 #include "clang/AST/Decl.h"
+#include "clang/AST/Expr.h"
 #include "clang/AST/Stmt.h"
 #include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Analysis/FlowSensitive/MatchSwitch.h"
+#include "clang/Analysis/FlowSensitive/StorageLocation.h"
+#include "clang/Analysis/FlowSensitive/Value.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
 
 namespace clang::dataflow {
 
@@ -58,6 +63,107 @@ ast_matchers::StatementMatcher isSmartPointerLikeOperatorArrow();
 ast_matchers::StatementMatcher isSmartPointerLikeValueMethodCall();
 ast_matchers::StatementMatcher isSmartPointerLikeGetMethodCall();
 
+// Common transfer functions.
+
+/// Returns the "canonical" callee for smart pointer operators (`*` and `->`)
+/// as a key for caching.
+///
+/// We choose `*` as the canonical one, since it needs a
+/// StorageLocation anyway.
+///
+/// Note: there may be multiple `operator*` (one const, one non-const).
+/// We pick the const one, which the above provided matchers require to exist.
+const FunctionDecl *
+getCanonicalSmartPointerLikeOperatorCallee(const CallExpr *CE);
+
+/// A transfer function for `operator*` (and `value`) calls that can be
+/// cached. Runs the `InitializeLoc` callback to initialize any new
+/// StorageLocations.
+///
+/// Requirements:
+///
+/// - LatticeT should use the `CachedConstAccessorsLattice` mixin.
+template <typename LatticeT>
+void transferSmartPointerLikeCachedDeref(
+    const CallExpr *DerefExpr, RecordStorageLocation *SmartPointerLoc,
+    TransferState<LatticeT> &State,
+    llvm::function_ref<void(StorageLocation &)> InitializeLoc);
+
+/// A transfer function for `operator->` (and `get`) calls that can be cached.
+/// Runs the `InitializeLoc` callback to initialize any new StorageLocations.
+///
+/// Requirements:
+///
+/// - LatticeT should use the `CachedConstAccessorsLattice` mixin.
+template <typename LatticeT>
+void transferSmartPointerLikeCachedGet(
+    const CallExpr *GetExpr, RecordStorageLocation *SmartPointerLoc,
+    TransferState<LatticeT> &State,
+    llvm::function_ref<void(StorageLocation &)> InitializeLoc);
+
+template <typename LatticeT>
+void transferSmartPointerLikeCachedDeref(
+    const CallExpr *DerefExpr, RecordStorageLocation *SmartPointerLoc,
+    TransferState<LatticeT> &State,
+    llvm::function_ref<void(StorageLocation &)> InitializeLoc) {
+  if (State.Env.getStorageLocation(*DerefExpr) != nullptr)
+    return;
+  if (SmartPointerLoc == nullptr)
+    return;
+
+  const FunctionDecl *Callee = DerefExpr->getDirectCallee();
+  if (Callee == nullptr)
+    return;
+  const FunctionDecl *CanonicalCallee =
+      getCanonicalSmartPointerLikeOperatorCallee(DerefExpr);
+  // This shouldn't happen, as we should at least find `Callee` itself.
+  assert(CanonicalCallee != nullptr);
+  if (CanonicalCallee != Callee) {
+    // When using the provided matchers, we should always get a reference to
+    // the same type.
+    assert(CanonicalCallee->getReturnType()->isReferenceType() &&
+           Callee->getReturnType()->isReferenceType());
+    assert(CanonicalCallee->getReturnType()
+               .getNonReferenceType()
+               ->getCanonicalTypeUnqualified() ==
+           Callee->getReturnType()
+               .getNonReferenceType()
+               ->getCanonicalTypeUnqualified());
+  }
+
+  StorageLocation &LocForValue =
+      State.Lattice.getOrCreateConstMethodReturnStorageLocation(
+          *SmartPointerLoc, CanonicalCallee, State.Env, InitializeLoc);
+  State.Env.setStorageLocation(*DerefExpr, LocForValue);
+}
+
+template <typename LatticeT>
+void transferSmartPointerLikeCachedGet(
+    const CallExpr *GetExpr, RecordStorageLocation *SmartPointerLoc,
+    TransferState<LatticeT> &State,
+    llvm::function_ref<void(StorageLocation &)> InitializeLoc) {
+  if (SmartPointerLoc == nullptr)
+    return;
+
+  const FunctionDecl *CanonicalCallee =
+      getCanonicalSmartPointerLikeOperatorCallee(GetExpr);
+
+  if (CanonicalCallee != nullptr) {
+    auto &LocForValue =
+        State.Lattice.getOrCreateConstMethodReturnStorageLocation(
+            *SmartPointerLoc, CanonicalCallee, State.Env, InitializeLoc);
+    State.Env.setValue(*GetExpr,
+                       State.Env.template create<PointerValue>(LocForValue));
+  } else {
+    // Otherwise, just cache the pointer value as if it was a const accessor.
+    Value *Val = State.Lattice.getOrCreateConstMethodReturnValue(
+        *SmartPointerLoc, GetExpr, State.Env);
+    if (Val == nullptr)
+      return;
+    State.Env.setValue(*GetExpr, *Val);
+  }
+}
+
 } // namespace clang::dataflow
 
 #endif // LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_SMARTPOINTERACCESSORCACHING_H
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index bc5f0662adf8..12faf0659700 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2776,7 +2776,7 @@ def Ownership : InheritableAttr {
   let Args = [IdentifierArgument<"Module">,
               VariadicParamIdxArgument<"Args">];
   let Subjects = SubjectList<[HasFunctionProto]>;
-  let Documentation = [Undocumented];
+  let Documentation = [OwnershipDocs];
 }
 
 def Packed : InheritableAttr {
@@ -4335,16 +4335,6 @@ def HLSLLoopHint: StmtAttr {
   let Documentation = [HLSLLoopHintDocs, HLSLUnrollHintDocs];
 }
 
-def HLSLControlFlowHint: StmtAttr {
-  /// [branch]
-  /// [flatten]
-  let Spellings = [Microsoft<"branch">, Microsoft<"flatten">];
-  let Subjects = SubjectList<[IfStmt],
-                              ErrorDiag, "'if' statements">;
-  let LangOpts = [HLSL];
-  let Documentation = [InternalOnly];
-}
-
 def CapturedRecord : InheritableAttr {
   // This attribute has no spellings as it is only ever created implicitly.
   let Spellings = [];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index fdad4c9a3ea1..b8d702e41aa0 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -1389,6 +1389,82 @@ Query for this attribute with ``__has_attribute(overloadable)``.
   }];
 }
 
+def OwnershipDocs : Documentation {
+  let Heading = "ownership_holds, ownership_returns, ownership_takes (Clang "
+                "Static Analyzer)";
+  let Category = DocCatFunction;
+  let Label = "analyzer-ownership-attrs";
+  let Content = [{
+
+.. note::
+
+  In order for the Clang Static Analyzer to acknowledge these attributes, the
+  ``Optimistic`` config needs to be set to true for the checker
+  ``unix.DynamicMemoryModeling``:
+
+  ``-Xclang -analyzer-config -Xclang unix.DynamicMemoryModeling:Optimistic=true``
+
+These attributes are used by the Clang Static Analyzer's dynamic memory modeling
+facilities to mark custom allocating/deallocating functions.
+
+All 3 attributes' first parameter of type string is the type of the allocation:
+``malloc``, ``new``, etc. to allow for catching :ref:`mismatched deallocation
+<unix-MismatchedDeallocator>` bugs. The allocation type can be any string, e.g.
+a function annotated with
+returning a piece of memory of type ``lasagna`` but freed with a function
+annotated to release ``cheese`` typed memory will result in mismatched
+deallocation warning.
+
+The (currently) only allocation type having special meaning is ``malloc`` --
+the Clang Static Analyzer makes sure that allocating functions annotated with
+``malloc`` are treated like they used the standard ``malloc()``, and can be
+safely deallocated with the standard ``free()``.
+
+* Use ``ownership_returns`` to mark a function as an allocating function. Takes
+  1 parameter to denote the allocation type.
+* Use ``ownership_takes`` to mark a function as a deallocating function. Takes 2
+  parameters: the allocation type, and the index of the parameter that is being
+  deallocated (counting from 1).
+* Use ``ownership_holds`` to mark that a function takes over the ownership of a
+  piece of memory and will free it at some unspecified point in the future. Like
+  ``ownership_takes``, this takes 2 parameters: the allocation type, and the
+  index of the parameter whose ownership will be taken over (counting from 1).
+
+The annotations ``ownership_takes`` and ``ownership_holds`` both prevent memory
+leak reports (concerning the specified argument); the difference between them
+is that using taken memory is a use-after-free error, while using held memory
+is assumed to be legitimate.
+
+Example:
+
+.. code-block:: c
+
+  // Denotes that my_malloc will return with a dynamically allocated piece of
+  // memory using malloc().
+  void __attribute((ownership_returns(malloc))) *my_malloc(size_t);
+
+  // Denotes that my_free will deallocate its parameter using free().
+  void __attribute((ownership_takes(malloc, 1))) my_free(void *);
+
+  // Denotes that my_hold will take over the ownership of its parameter that was
+  // allocated via malloc().
+  void __attribute((ownership_holds(malloc, 1))) my_hold(void *);
+
+Further reading about dynamic memory modeling in the Clang Static Analyzer is
+found in these checker docs:
+:ref:`unix.Malloc <unix-Malloc>`, :ref:`unix.MallocSizeof <unix-MallocSizeof>`,
+:ref:`unix.MismatchedDeallocator <unix-MismatchedDeallocator>`,
+:ref:`cplusplus.NewDelete <cplusplus-NewDelete>`,
+:ref:`cplusplus.NewDeleteLeaks <cplusplus-NewDeleteLeaks>`,
+:ref:`optin.taint.TaintedAlloc <optin-taint-TaintedAlloc>`.
+Mind that many more checkers are affected by dynamic memory modeling changes to
+some extent.
+
+Further reading for other annotations:
+`Source Annotations in the Clang Static Analyzer <https://clang-analyzer.llvm.org/annotations.html>`_.
+  }];
+}
+
 def ObjCMethodFamilyDocs : Documentation {
   let Category = DocCatFunction;
   let Content = [{
diff --git a/clang/include/clang/Basic/BuiltinsSPIRV.td b/clang/include/clang/Basic/BuiltinsSPIRV.td
index 195c13573d04..1e66939b822e 100644
--- a/clang/include/clang/Basic/BuiltinsSPIRV.td
+++ b/clang/include/clang/Basic/BuiltinsSPIRV.td
@@ -8,7 +8,7 @@
 
 include "clang/Basic/BuiltinsBase.td"
 
-def HLSLDistance : Builtin {
+def SPIRVDistance : Builtin {
   let Spellings = ["__builtin_spirv_distance"];
   let Attributes = [NoThrow, Const];
   let Prototype = "void(...)";
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 03fb7ca9bc3c..ab2d6237c1ca 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -5878,7 +5878,7 @@ def err_pack_expansion_without_parameter_packs : Error<
   "pack expansion does not contain any unexpanded parameter packs">;
 def err_pack_expansion_length_conflict : Error<
   "pack expansion contains parameter packs %0 and %1 that have different "
-  "lengths (%2 vs. %select{|at least }3%4))">;
+  "lengths (%2 vs. %select{|at least }3%4)">;
 def err_pack_expansion_length_conflict_multilevel : Error<
   "pack expansion contains parameter pack %0 that has a different "
   "length (%1 vs. %select{|at least }2%3) from outer parameter packs">;
@@ -10512,6 +10512,10 @@ def warn_second_parameter_to_va_arg_ownership_qualified : Warning<
 def warn_second_parameter_to_va_arg_never_compatible : Warning<
   "second argument to 'va_arg' is of promotable type %0; this va_arg has "
   "undefined behavior because arguments will be promoted to %1">, InGroup<Varargs>;
+def warn_second_parameter_to_va_arg_array : Warning<
+  "second argument to 'va_arg' is of array type %0; "
+  "this va_arg has undefined behavior because arguments "
+  "will never be compatible with array type">, InGroup<Varargs>;
 
 def warn_return_missing_expr : Warning<
   "non-void %select{function|method}1 %0 should return a value">, DefaultError,
@@ -12823,6 +12827,10 @@ def err_acc_loop_not_monotonic
             "('++', '--', or compound assignment)">;
 def err_acc_construct_one_clause_of
     : Error<"OpenACC '%0' construct must have at least one %1 clause">;
+def err_acc_update_as_body
+    : Error<"OpenACC 'update' construct may not appear in place of the "
+            "statement following a%select{n if statement| while statement| do "
+            "statement| switch statement| label statement}0">;
 
 // AMDGCN builtins diagnostics
 def err_amdgcn_global_load_lds_size_invalid_value : Error<"invalid size value">;
diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td
index 2ecf19ef6252..ce2c48bd3c84 100644
--- a/clang/include/clang/Basic/StmtNodes.td
+++ b/clang/include/clang/Basic/StmtNodes.td
@@ -316,6 +316,7 @@ def OpenACCWaitConstruct : StmtNode<OpenACCConstructStmt>;
 def OpenACCInitConstruct : StmtNode<OpenACCConstructStmt>;
 def OpenACCShutdownConstruct : StmtNode<OpenACCConstructStmt>;
 def OpenACCSetConstruct : StmtNode<OpenACCConstructStmt>;
+def OpenACCUpdateConstruct : StmtNode<OpenACCConstructStmt>;
 
 // OpenACC Additional Expressions.
 def OpenACCAsteriskSizeExpr : StmtNode<Expr>;
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index f2905f30a7c3..43c09cf1f973 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -1531,7 +1531,7 @@ public:
 
   // Return the target-specific priority for features/cpus/vendors so
   // that they can be properly sorted for checking.
-  virtual unsigned getFMVPriority(ArrayRef<StringRef> Features) const {
+  virtual uint64_t getFMVPriority(ArrayRef<StringRef> Features) const {
     return 0;
   }
 
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 6b31dec004a1..891ed9874bb3 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -716,6 +716,8 @@ let SMETargetGuard = "sme2" in {
   def SVZERO_ZT : Inst<"svzero_zt", "vi", "", MergeNone, "aarch64_sme_zero_zt", [IsOverloadNone, IsStreamingCompatible, IsOutZT0], [ImmCheck<0, ImmCheck0_0>]>;
 }
 
+def IN_STREAMING_MODE :  Inst<"__arm_in_streaming_mode", "sv", "Pc", MergeNone, "aarch64_sme_in_streaming_mode", [IsOverloadNone, IsStreamingCompatible], []>;
+
 //
 // lookup table expand four contiguous registers
 //
diff --git a/clang/include/clang/CodeGen/BackendUtil.h b/clang/include/clang/CodeGen/BackendUtil.h
index fc8ed4f011f9..7aa4f9db6c2e 100644
--- a/clang/include/clang/CodeGen/BackendUtil.h
+++ b/clang/include/clang/CodeGen/BackendUtil.h
@@ -14,46 +14,46 @@
 #include <memory>
 
 namespace llvm {
-  class BitcodeModule;
-  template <typename T> class Expected;
-  template <typename T> class IntrusiveRefCntPtr;
-  class Module;
-  class MemoryBufferRef;
-  namespace vfs {
-  class FileSystem;
-  } // namespace vfs
-}
+class BitcodeModule;
+template <typename T> class Expected;
+template <typename T> class IntrusiveRefCntPtr;
+class Module;
+class MemoryBufferRef;
+namespace vfs {
+class FileSystem;
+} // namespace vfs
+} // namespace llvm
 
 namespace clang {
-  class DiagnosticsEngine;
-  class HeaderSearchOptions;
-  class CodeGenOptions;
-  class TargetOptions;
-  class LangOptions;
-  class BackendConsumer;
-
-  enum BackendAction {
-    Backend_EmitAssembly,  ///< Emit native assembly files
-    Backend_EmitBC,        ///< Emit LLVM bitcode files
-    Backend_EmitLL,        ///< Emit human-readable LLVM assembly
-    Backend_EmitNothing,   ///< Don't emit anything (benchmarking mode)
-    Backend_EmitMCNull,    ///< Run CodeGen, but don't emit anything
-    Backend_EmitObj        ///< Emit native object files
-  };
-
-  void EmitBackendOutput(DiagnosticsEngine &Diags, const HeaderSearchOptions &,
-                         const CodeGenOptions &CGOpts,
-                         const TargetOptions &TOpts, const LangOptions &LOpts,
-                         StringRef TDesc, llvm::Module *M, BackendAction Action,
-                         llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS,
-                         std::unique_ptr<raw_pwrite_stream> OS,
-                         BackendConsumer *BC = nullptr);
-
-  void EmbedBitcode(llvm::Module *M, const CodeGenOptions &CGOpts,
-                    llvm::MemoryBufferRef Buf);
-
-  void EmbedObject(llvm::Module *M, const CodeGenOptions &CGOpts,
-                   DiagnosticsEngine &Diags);
-}
+class DiagnosticsEngine;
+class HeaderSearchOptions;
+class CodeGenOptions;
+class TargetOptions;
+class LangOptions;
+class BackendConsumer;
+
+enum BackendAction {
+  Backend_EmitAssembly, ///< Emit native assembly files
+  Backend_EmitBC,       ///< Emit LLVM bitcode files
+  Backend_EmitLL,       ///< Emit human-readable LLVM assembly
+  Backend_EmitNothing,  ///< Don't emit anything (benchmarking mode)
+  Backend_EmitMCNull,   ///< Run CodeGen, but don't emit anything
+  Backend_EmitObj       ///< Emit native object files
+};
+
+void EmitBackendOutput(DiagnosticsEngine &Diags, const HeaderSearchOptions &,
+                       const CodeGenOptions &CGOpts, const TargetOptions &TOpts,
+                       const LangOptions &LOpts, StringRef TDesc,
+                       llvm::Module *M, BackendAction Action,
+                       llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS,
+                       std::unique_ptr<raw_pwrite_stream> OS,
+                       BackendConsumer *BC = nullptr);
+
+void EmbedBitcode(llvm::Module *M, const CodeGenOptions &CGOpts,
+                  llvm::MemoryBufferRef Buf);
+
+void EmbedObject(llvm::Module *M, const CodeGenOptions &CGOpts,
+                 DiagnosticsEngine &Diags);
+} // namespace clang
 
 #endif
diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
index c23d037e725b..80bce574a3b6 100644
--- a/clang/include/clang/Driver/Driver.h
+++ b/clang/include/clang/Driver/Driver.h
@@ -741,6 +741,11 @@ private:
   /// \returns true if error occurred.
   bool loadDefaultConfigFiles(llvm::cl::ExpansionContext &ExpCtx);
 
+  /// Tries to load options from customization file.
+  ///
+  /// \returns true if error occurred.
+  bool loadZOSCustomizationFile(llvm::cl::ExpansionContext &);
+
   /// Read options from the specified file.
   ///
   /// \param [in] FileName File to read.
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 0528104f0551..52823430919d 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6945,6 +6945,9 @@ defm unsigned : OptInFC1FFlag<"unsigned", "Enables UNSIGNED type">;
 def fno_automatic : Flag<["-"], "fno-automatic">, Group<f_Group>,
   HelpText<"Implies the SAVE attribute for non-automatic local objects in subprograms unless RECURSIVE">;
 
+def fsave_main_program : Flag<["-"], "fsave-main-program">, Group<f_Group>,
+  HelpText<"Place all variables from the main program in static memory (otherwise scalars may be placed on the stack)">;
+
 defm stack_arrays : BoolOptionWithoutMarshalling<"f", "stack-arrays",
   PosFlag<SetTrue, [], [ClangOption], "Attempt to allocate array temporaries on the stack, no matter their size">,
   NegFlag<SetFalse, [], [ClangOption], "Allocate array temporaries on the heap (default)">>;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 18fd95f77ec2..a41f16f6dc8c 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -13062,7 +13062,6 @@ public:
   ///
   /// \param SkipForSpecialization when specified, any template specializations
   /// in a traversal would be ignored.
-  ///
   /// \param ForDefaultArgumentSubstitution indicates we should continue looking
   /// when encountering a specialized member function template, rather than
   /// returning immediately.
@@ -13074,17 +13073,6 @@ public:
       bool SkipForSpecialization = false,
       bool ForDefaultArgumentSubstitution = false);
 
-  /// Apart from storing the result to \p Result, this behaves the same as
-  /// another overload.
-  void getTemplateInstantiationArgs(
-      MultiLevelTemplateArgumentList &Result, const NamedDecl *D,
-      const DeclContext *DC = nullptr, bool Final = false,
-      std::optional<ArrayRef<TemplateArgument>> Innermost = std::nullopt,
-      bool RelativeToPrimary = false, const FunctionDecl *Pattern = nullptr,
-      bool ForConstraintInstantiation = false,
-      bool SkipForSpecialization = false,
-      bool ForDefaultArgumentSubstitution = false);
-
   /// RAII object to handle the state changes required to synthesize
   /// a function body.
   class SynthesizedFunctionScope {
@@ -13354,7 +13342,7 @@ public:
   ExprResult
   SubstConstraintExpr(Expr *E,
                       const MultiLevelTemplateArgumentList &TemplateArgs);
-  // Unlike the above, this does not evaluate constraints.
+  // Unlike the above, this does not evaluates constraints.
   ExprResult SubstConstraintExprWithoutSatisfaction(
       Expr *E, const MultiLevelTemplateArgumentList &TemplateArgs);
 
@@ -14475,10 +14463,10 @@ public:
       const MultiLevelTemplateArgumentList &TemplateArgs,
       SourceRange TemplateIDRange);
 
-  bool CheckFunctionTemplateConstraints(SourceLocation PointOfInstantiation,
-                                        FunctionDecl *Decl,
-                                        ArrayRef<TemplateArgument> TemplateArgs,
-                                        ConstraintSatisfaction &Satisfaction);
+  bool CheckInstantiatedFunctionTemplateConstraints(
+      SourceLocation PointOfInstantiation, FunctionDecl *Decl,
+      ArrayRef<TemplateArgument> TemplateArgs,
+      ConstraintSatisfaction &Satisfaction);
 
   /// \brief Emit diagnostics explaining why a constraint expression was deemed
   /// unsatisfied.
diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h
index 03abf4ab2cec..0f86d46bc980 100644
--- a/clang/include/clang/Sema/SemaOpenACC.h
+++ b/clang/include/clang/Sema/SemaOpenACC.h
@@ -409,6 +409,8 @@ public:
               ClauseKind == OpenACCClauseKind::Detach ||
               ClauseKind == OpenACCClauseKind::DevicePtr ||
               ClauseKind == OpenACCClauseKind::Reduction ||
+              (ClauseKind == OpenACCClauseKind::Self &&
+               DirKind == OpenACCDirectiveKind::Update) ||
               ClauseKind == OpenACCClauseKind::FirstPrivate) &&
              "Parsed clause kind does not have a var-list");
 
@@ -551,6 +553,8 @@ public:
               ClauseKind == OpenACCClauseKind::UseDevice ||
               ClauseKind == OpenACCClauseKind::Detach ||
               ClauseKind == OpenACCClauseKind::DevicePtr ||
+              (ClauseKind == OpenACCClauseKind::Self &&
+               DirKind == OpenACCDirectiveKind::Update) ||
               ClauseKind == OpenACCClauseKind::FirstPrivate) &&
              "Parsed clause kind does not have a var-list");
       assert((!IsReadOnly || ClauseKind == OpenACCClauseKind::CopyIn ||
@@ -590,6 +594,8 @@ public:
               ClauseKind == OpenACCClauseKind::UseDevice ||
               ClauseKind == OpenACCClauseKind::Detach ||
               ClauseKind == OpenACCClauseKind::DevicePtr ||
+              (ClauseKind == OpenACCClauseKind::Self &&
+               DirKind == OpenACCDirectiveKind::Update) ||
               ClauseKind == OpenACCClauseKind::FirstPrivate) &&
              "Parsed clause kind does not have a var-list");
       assert((!IsReadOnly || ClauseKind == OpenACCClauseKind::CopyIn ||
diff --git a/clang/include/clang/Sema/Template.h b/clang/include/clang/Sema/Template.h
index 59a0575ca980..9800f75f676a 100644
--- a/clang/include/clang/Sema/Template.h
+++ b/clang/include/clang/Sema/Template.h
@@ -522,12 +522,6 @@ enum class TemplateSubstitutionKind : char {
     llvm::PointerUnion<Decl *, DeclArgumentPack *> *
     findInstantiationOf(const Decl *D);
 
-    /// Similar to \p findInstantiationOf(), but it wouldn't assert if the
-    /// instantiation was not found within the current instantiation scope. This
-    /// is helpful for on-demand declaration instantiation.
-    llvm::PointerUnion<Decl *, DeclArgumentPack *> *
-    findInstantiationUnsafe(const Decl *D);
-
     void InstantiatedLocal(const Decl *D, Decl *Inst);
     void InstantiatedLocalPackArg(const Decl *D, VarDecl *Inst);
     void MakeInstantiatedLocalArgPack(const Decl *D);
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index a46a7e133f1b..aac165130b71 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -2025,6 +2025,7 @@ enum StmtCode {
   STMT_OPENACC_INIT_CONSTRUCT,
   STMT_OPENACC_SHUTDOWN_CONSTRUCT,
   STMT_OPENACC_SET_CONSTRUCT,
+  STMT_OPENACC_UPDATE_CONSTRUCT,
 
   // HLSL Constructs
   EXPR_HLSL_OUT_ARG,
diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
index 3f341ecf8c1e..2c970301879d 100644
--- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
+++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
@@ -126,11 +126,18 @@ enum class CTUPhase1InliningKind { None, Small, All };
 
 class PositiveAnalyzerOption {
 public:
-  PositiveAnalyzerOption() = default;
-  PositiveAnalyzerOption(const PositiveAnalyzerOption &) = default;
-  PositiveAnalyzerOption &operator=(const PositiveAnalyzerOption &) = default;
+  constexpr PositiveAnalyzerOption() = default;
+  constexpr PositiveAnalyzerOption(unsigned Value) : Value(Value) {
+    assert(Value > 0 && "only positive values are accepted");
+  }
+  constexpr PositiveAnalyzerOption(const PositiveAnalyzerOption &) = default;
+  constexpr PositiveAnalyzerOption &
+  operator=(const PositiveAnalyzerOption &Other) {
+    Value = Other.Value;
+    return *this;
+  }
 
-  static std::optional<PositiveAnalyzerOption> create(unsigned Val) {
+  static constexpr std::optional<PositiveAnalyzerOption> create(unsigned Val) {
     if (Val == 0)
       return std::nullopt;
     return PositiveAnalyzerOption{Val};
@@ -141,11 +148,9 @@ public:
       return std::nullopt;
     return PositiveAnalyzerOption::create(Parsed);
   }
-  operator unsigned() const { return Value; }
+  constexpr operator unsigned() const { return Value; }
 
 private:
-  explicit constexpr PositiveAnalyzerOption(unsigned Value) : Value(Value) {}
-
   unsigned Value = 1;
 };
 
diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
index 012237e0278f..ddb078dc16e3 100644
--- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
+++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h
@@ -15,6 +15,7 @@
 #include "clang/Tooling/JSONCompilationDatabase.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/MapVector.h"
+#include <functional>
 #include <optional>
 #include <string>
 #include <vector>
@@ -25,7 +26,7 @@ namespace dependencies {
 
 /// A callback to lookup module outputs for "-fmodule-file=", "-o" etc.
 using LookupModuleOutputCallback =
-    llvm::function_ref<std::string(const ModuleID &, ModuleOutputKind)>;
+    std::function<std::string(const ModuleID &, ModuleOutputKind)>;
 
 /// Graph of modular dependencies.
 using ModuleDepsGraph = std::vector<ModuleDeps>;
diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap
index 5bb9f6b7a91f..f00dede7fd52 100644
--- a/clang/include/module.modulemap
+++ b/clang/include/module.modulemap
@@ -62,8 +62,6 @@ module Clang_Basic {
   textual header "clang/Basic/BuiltinsVE.def"
   textual header "clang/Basic/BuiltinsVEVL.gen.def"
   textual header "clang/Basic/BuiltinsWebAssembly.def"
-  textual header "clang/Basic/BuiltinsX86.def"
-  textual header "clang/Basic/BuiltinsX86_64.def"
   textual header "clang/Basic/BuiltinsXCore.def"
   textual header "clang/Basic/CFProtectionOptions.def"
   textual header "clang/Basic/CodeGenOptions.def"
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index a9ecb4ee9c76..b10513f49a8d 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -14357,7 +14357,7 @@ QualType ASTContext::getCorrespondingSignedFixedPointType(QualType Ty) const {
 // corresponding backend features (which may contain duplicates).
 static std::vector<std::string> getFMVBackendFeaturesFor(
     const llvm::SmallVectorImpl<StringRef> &FMVFeatStrings) {
-  std::vector<std::string> BackendFeats{{"+fmv"}};
+  std::vector<std::string> BackendFeats;
   llvm::AArch64::ExtensionSet FeatureBits;
   for (StringRef F : FMVFeatStrings)
     if (auto FMVExt = llvm::AArch64::parseFMVExtension(F))
diff --git a/clang/lib/AST/OpenACCClause.cpp b/clang/lib/AST/OpenACCClause.cpp
index 76fea1fd47d2..da63b471d985 100644
--- a/clang/lib/AST/OpenACCClause.cpp
+++ b/clang/lib/AST/OpenACCClause.cpp
@@ -20,7 +20,7 @@ using namespace clang;
 bool OpenACCClauseWithParams::classof(const OpenACCClause *C) {
   return OpenACCDeviceTypeClause::classof(C) ||
          OpenACCClauseWithCondition::classof(C) ||
-         OpenACCClauseWithExprs::classof(C);
+         OpenACCClauseWithExprs::classof(C) || OpenACCSelfClause::classof(C);
 }
 bool OpenACCClauseWithExprs::classof(const OpenACCClause *C) {
   return OpenACCWaitClause::classof(C) || OpenACCNumGangsClause::classof(C) ||
@@ -41,7 +41,7 @@ bool OpenACCClauseWithVarList::classof(const OpenACCClause *C) {
          OpenACCReductionClause::classof(C) || OpenACCCreateClause::classof(C);
 }
 bool OpenACCClauseWithCondition::classof(const OpenACCClause *C) {
-  return OpenACCIfClause::classof(C) || OpenACCSelfClause::classof(C);
+  return OpenACCIfClause::classof(C);
 }
 bool OpenACCClauseWithSingleIntExpr::classof(const OpenACCClause *C) {
   return OpenACCNumWorkersClause::classof(C) ||
@@ -87,19 +87,43 @@ OpenACCSelfClause *OpenACCSelfClause::Create(const ASTContext &C,
                                              SourceLocation LParenLoc,
                                              Expr *ConditionExpr,
                                              SourceLocation EndLoc) {
-  void *Mem = C.Allocate(sizeof(OpenACCIfClause), alignof(OpenACCIfClause));
+  void *Mem = C.Allocate(OpenACCSelfClause::totalSizeToAlloc<Expr *>(1));
   return new (Mem)
       OpenACCSelfClause(BeginLoc, LParenLoc, ConditionExpr, EndLoc);
 }
 
+OpenACCSelfClause *OpenACCSelfClause::Create(const ASTContext &C,
+                                             SourceLocation BeginLoc,
+                                             SourceLocation LParenLoc,
+                                             ArrayRef<Expr *> VarList,
+                                             SourceLocation EndLoc) {
+  void *Mem =
+      C.Allocate(OpenACCSelfClause::totalSizeToAlloc<Expr *>(VarList.size()));
+  return new (Mem) OpenACCSelfClause(BeginLoc, LParenLoc, VarList, EndLoc);
+}
+
+OpenACCSelfClause::OpenACCSelfClause(SourceLocation BeginLoc,
+                                     SourceLocation LParenLoc,
+                                     llvm::ArrayRef<Expr *> VarList,
+                                     SourceLocation EndLoc)
+    : OpenACCClauseWithParams(OpenACCClauseKind::Self, BeginLoc, LParenLoc,
+                              EndLoc),
+      HasConditionExpr(std::nullopt), NumExprs(VarList.size()) {
+  std::uninitialized_copy(VarList.begin(), VarList.end(),
+                          getTrailingObjects<Expr *>());
+}
+
 OpenACCSelfClause::OpenACCSelfClause(SourceLocation BeginLoc,
                                      SourceLocation LParenLoc,
                                      Expr *ConditionExpr, SourceLocation EndLoc)
-    : OpenACCClauseWithCondition(OpenACCClauseKind::Self, BeginLoc, LParenLoc,
-                                 ConditionExpr, EndLoc) {
+    : OpenACCClauseWithParams(OpenACCClauseKind::Self, BeginLoc, LParenLoc,
+                              EndLoc),
+      HasConditionExpr(ConditionExpr != nullptr), NumExprs(1) {
   assert((!ConditionExpr || ConditionExpr->isInstantiationDependent() ||
           ConditionExpr->getType()->isScalarType()) &&
          "Condition expression type not scalar/dependent");
+  std::uninitialized_copy(&ConditionExpr, &ConditionExpr + 1,
+                          getTrailingObjects<Expr *>());
 }
 
 OpenACCClause::child_range OpenACCClause::children() {
@@ -555,9 +579,17 @@ void OpenACCClausePrinter::VisitIfClause(const OpenACCIfClause &C) {
 
 void OpenACCClausePrinter::VisitSelfClause(const OpenACCSelfClause &C) {
   OS << "self";
-  if (const Expr *CondExpr = C.getConditionExpr()) {
+
+  if (C.isConditionExprClause()) {
+    if (const Expr *CondExpr = C.getConditionExpr()) {
+      OS << "(";
+      printExpr(CondExpr);
+      OS << ")";
+    }
+  } else {
     OS << "(";
-    printExpr(CondExpr);
+    llvm::interleaveComma(C.getVarList(), OS,
+                          [&](const Expr *E) { printExpr(E); });
     OS << ")";
   }
 }
diff --git a/clang/lib/AST/ParentMap.cpp b/clang/lib/AST/ParentMap.cpp
index fd749b02b758..e62e71bf5a51 100644
--- a/clang/lib/AST/ParentMap.cpp
+++ b/clang/lib/AST/ParentMap.cpp
@@ -33,17 +33,19 @@ static void BuildParentMap(MapTy& M, Stmt* S,
   switch (S->getStmtClass()) {
   case Stmt::PseudoObjectExprClass: {
     PseudoObjectExpr *POE = cast<PseudoObjectExpr>(S);
-
-    if (OVMode == OV_Opaque && M[POE->getSyntacticForm()])
-      break;
-
-    // If we are rebuilding the map, clear out any existing state.
-    if (M[POE->getSyntacticForm()])
+    Expr *SF = POE->getSyntacticForm();
+
+    auto [Iter, Inserted] = M.try_emplace(SF, S);
+    if (!Inserted) {
+      // Nothing more to do in opaque mode if we are updating an existing map.
+      if (OVMode == OV_Opaque)
+        break;
+      // Update the entry in transparent mode, and clear existing state.
+      Iter->second = S;
       for (Stmt *SubStmt : S->children())
-        M[SubStmt] = nullptr;
-
-    M[POE->getSyntacticForm()] = S;
-    BuildParentMap(M, POE->getSyntacticForm(), OV_Transparent);
+        M.erase(SubStmt);
+    }
+    BuildParentMap(M, SF, OV_Transparent);
 
     for (PseudoObjectExpr::semantics_iterator I = POE->semantics_begin(),
                                               E = POE->semantics_end();
@@ -78,10 +80,15 @@ static void BuildParentMap(MapTy& M, Stmt* S,
     // The right thing to do is to give the OpaqueValueExpr its syntactic
     // parent, then not reassign that when traversing the semantic expressions.
     OpaqueValueExpr *OVE = cast<OpaqueValueExpr>(S);
-    if (OVMode == OV_Transparent || !M[OVE->getSourceExpr()]) {
-      M[OVE->getSourceExpr()] = S;
-      BuildParentMap(M, OVE->getSourceExpr(), OV_Transparent);
+    Expr *SrcExpr = OVE->getSourceExpr();
+    auto [Iter, Inserted] = M.try_emplace(SrcExpr, S);
+    // Force update in transparent mode.
+    if (!Inserted && OVMode == OV_Transparent) {
+      Iter->second = S;
+      Inserted = true;
     }
+    if (Inserted)
+      BuildParentMap(M, SrcExpr, OV_Transparent);
     break;
   }
   case Stmt::CapturedStmtClass:
diff --git a/clang/lib/AST/StmtOpenACC.cpp b/clang/lib/AST/StmtOpenACC.cpp
index 889573f57b40..2b0ac716bab5 100644
--- a/clang/lib/AST/StmtOpenACC.cpp
+++ b/clang/lib/AST/StmtOpenACC.cpp
@@ -284,3 +284,24 @@ OpenACCSetConstruct::Create(const ASTContext &C, SourceLocation Start,
   auto *Inst = new (Mem) OpenACCSetConstruct(Start, DirectiveLoc, End, Clauses);
   return Inst;
 }
+
+OpenACCUpdateConstruct *
+OpenACCUpdateConstruct::CreateEmpty(const ASTContext &C, unsigned NumClauses) {
+  void *Mem = C.Allocate(
+      OpenACCUpdateConstruct::totalSizeToAlloc<const OpenACCClause *>(
+          NumClauses));
+  auto *Inst = new (Mem) OpenACCUpdateConstruct(NumClauses);
+  return Inst;
+}
+
+OpenACCUpdateConstruct *
+OpenACCUpdateConstruct::Create(const ASTContext &C, SourceLocation Start,
+                               SourceLocation DirectiveLoc, SourceLocation End,
+                               ArrayRef<const OpenACCClause *> Clauses) {
+  void *Mem = C.Allocate(
+      OpenACCUpdateConstruct::totalSizeToAlloc<const OpenACCClause *>(
+          Clauses.size()));
+  auto *Inst =
+      new (Mem) OpenACCUpdateConstruct(Start, DirectiveLoc, End, Clauses);
+  return Inst;
+}
diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp
index 52eead979b17..52bcb5135d35 100644
--- a/clang/lib/AST/StmtPrinter.cpp
+++ b/clang/lib/AST/StmtPrinter.cpp
@@ -1204,10 +1204,12 @@ void StmtPrinter::VisitOpenACCInitConstruct(OpenACCInitConstruct *S) {
 void StmtPrinter::VisitOpenACCShutdownConstruct(OpenACCShutdownConstruct *S) {
   PrintOpenACCConstruct(S);
 }
-
 void StmtPrinter::VisitOpenACCSetConstruct(OpenACCSetConstruct *S) {
   PrintOpenACCConstruct(S);
 }
+void StmtPrinter::VisitOpenACCUpdateConstruct(OpenACCUpdateConstruct *S) {
+  PrintOpenACCConstruct(S);
+}
 
 void StmtPrinter::VisitOpenACCWaitConstruct(OpenACCWaitConstruct *S) {
   Indent() << "#pragma acc wait";
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index 150b92ef6a1a..cd91a7900538 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -2555,8 +2555,13 @@ void OpenACCClauseProfiler::VisitCreateClause(
 }
 
 void OpenACCClauseProfiler::VisitSelfClause(const OpenACCSelfClause &Clause) {
-  if (Clause.hasConditionExpr())
-    Profiler.VisitStmt(Clause.getConditionExpr());
+  if (Clause.isConditionExprClause()) {
+    if (Clause.hasConditionExpr())
+      Profiler.VisitStmt(Clause.getConditionExpr());
+  } else {
+    for (auto *E : Clause.getVarList())
+      Profiler.VisitStmt(E);
+  }
 }
 
 void OpenACCClauseProfiler::VisitFinalizeClause(
@@ -2780,6 +2785,13 @@ void StmtProfiler::VisitOpenACCSetConstruct(const OpenACCSetConstruct *S) {
   P.VisitOpenACCClauseList(S->clauses());
 }
 
+void StmtProfiler::VisitOpenACCUpdateConstruct(
+    const OpenACCUpdateConstruct *S) {
+  VisitStmt(S);
+  OpenACCClauseProfiler P{*this};
+  P.VisitOpenACCClauseList(S->clauses());
+}
+
 void StmtProfiler::VisitHLSLOutArgExpr(const HLSLOutArgExpr *S) {
   VisitStmt(S);
 }
diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index 00e3af3e8112..eedd8faad9e8 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -2931,7 +2931,6 @@ void TextNodeDumper::VisitOpenACCConstructStmt(const OpenACCConstructStmt *S) {
   OS << " " << S->getDirectiveKind();
 }
 void TextNodeDumper::VisitOpenACCLoopConstruct(const OpenACCLoopConstruct *S) {
-
   if (S->isOrphanedLoopConstruct())
     OS << " <orphan>";
   else
@@ -2940,40 +2939,44 @@ void TextNodeDumper::VisitOpenACCLoopConstruct(const OpenACCLoopConstruct *S) {
 
 void TextNodeDumper::VisitOpenACCCombinedConstruct(
     const OpenACCCombinedConstruct *S) {
-  OS << " " << S->getDirectiveKind();
+  VisitOpenACCConstructStmt(S);
 }
 
 void TextNodeDumper::VisitOpenACCDataConstruct(const OpenACCDataConstruct *S) {
-  OS << " " << S->getDirectiveKind();
+  VisitOpenACCConstructStmt(S);
 }
 
 void TextNodeDumper::VisitOpenACCEnterDataConstruct(
     const OpenACCEnterDataConstruct *S) {
-  OS << " " << S->getDirectiveKind();
+  VisitOpenACCConstructStmt(S);
 }
 
 void TextNodeDumper::VisitOpenACCExitDataConstruct(
     const OpenACCExitDataConstruct *S) {
-  OS << " " << S->getDirectiveKind();
+  VisitOpenACCConstructStmt(S);
 }
 
 void TextNodeDumper::VisitOpenACCHostDataConstruct(
     const OpenACCHostDataConstruct *S) {
-  OS << " " << S->getDirectiveKind();
+  VisitOpenACCConstructStmt(S);
 }
 
 void TextNodeDumper::VisitOpenACCWaitConstruct(const OpenACCWaitConstruct *S) {
-  OS << " " << S->getDirectiveKind();
+  VisitOpenACCConstructStmt(S);
 }
 void TextNodeDumper::VisitOpenACCInitConstruct(const OpenACCInitConstruct *S) {
-  OS << " " << S->getDirectiveKind();
+  VisitOpenACCConstructStmt(S);
 }
 void TextNodeDumper::VisitOpenACCShutdownConstruct(
     const OpenACCShutdownConstruct *S) {
-  OS << " " << S->getDirectiveKind();
+  VisitOpenACCConstructStmt(S);
 }
 void TextNodeDumper::VisitOpenACCSetConstruct(const OpenACCSetConstruct *S) {
-  OS << " " << S->getDirectiveKind();
+  VisitOpenACCConstructStmt(S);
+}
+void TextNodeDumper::VisitOpenACCUpdateConstruct(
+    const OpenACCUpdateConstruct *S) {
+  VisitOpenACCConstructStmt(S);
 }
 
 void TextNodeDumper::VisitEmbedExpr(const EmbedExpr *S) {
diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
index da5dda063344..e1394e28cd49 100644
--- a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
@@ -25,8 +25,10 @@
 #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
 #include "clang/Analysis/FlowSensitive/Formula.h"
 #include "clang/Analysis/FlowSensitive/RecordOps.h"
+#include "clang/Analysis/FlowSensitive/SmartPointerAccessorCaching.h"
 #include "clang/Analysis/FlowSensitive/StorageLocation.h"
 #include "clang/Analysis/FlowSensitive/Value.h"
+#include "clang/Basic/OperatorKinds.h"
 #include "clang/Basic/SourceLocation.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Casting.h"
@@ -555,24 +557,25 @@ void handleConstMemberCall(const CallExpr *CE,
                            LatticeTransferState &State) {
   // If the const method returns an optional or reference to an optional.
   if (RecordLoc != nullptr && isSupportedOptionalType(CE->getType())) {
-    StorageLocation *Loc =
+    const FunctionDecl *DirectCallee = CE->getDirectCallee();
+    if (DirectCallee == nullptr)
+      return;
+    StorageLocation &Loc =
         State.Lattice.getOrCreateConstMethodReturnStorageLocation(
-            *RecordLoc, CE, State.Env, [&](StorageLocation &Loc) {
+            *RecordLoc, DirectCallee, State.Env, [&](StorageLocation &Loc) {
               setHasValue(cast<RecordStorageLocation>(Loc),
                           State.Env.makeAtomicBoolValue(), State.Env);
             });
-    if (Loc == nullptr)
-      return;
     if (CE->isGLValue()) {
       // If the call to the const method returns a reference to an optional,
       // link the call expression to the cached StorageLocation.
-      State.Env.setStorageLocation(*CE, *Loc);
+      State.Env.setStorageLocation(*CE, Loc);
     } else {
       // If the call to the const method returns an optional by value, we
       // need to use CopyRecord to link the optional to the result object
       // of the call expression.
       auto &ResultLoc = State.Env.getResultObjectLocation(*CE);
-      copyRecord(*cast<RecordStorageLocation>(Loc), ResultLoc, State.Env);
+      copyRecord(cast<RecordStorageLocation>(Loc), ResultLoc, State.Env);
     }
     return;
   }
@@ -1031,6 +1034,48 @@ auto buildTransferMatchSwitch() {
             transferOptionalAndValueCmp(Cmp, Cmp->getArg(1), State.Env);
           })
 
+      // Smart-pointer-like operator* and operator-> calls that may look like
+      // const accessors (below) but need special handling to allow mixing
+      // the accessor calls.
+      .CaseOfCFGStmt<CXXOperatorCallExpr>(
+          isSmartPointerLikeOperatorStar(),
+          [](const CXXOperatorCallExpr *E,
+             const MatchFinder::MatchResult &Result,
+             LatticeTransferState &State) {
+            transferSmartPointerLikeCachedDeref(
+                E,
+                dyn_cast_or_null<RecordStorageLocation>(
+                    getLocBehindPossiblePointer(*E->getArg(0), State.Env)),
+                State, [](StorageLocation &Loc) {});
+          })
+      .CaseOfCFGStmt<CXXOperatorCallExpr>(
+          isSmartPointerLikeOperatorArrow(),
+          [](const CXXOperatorCallExpr *E,
+             const MatchFinder::MatchResult &Result,
+             LatticeTransferState &State) {
+            transferSmartPointerLikeCachedGet(
+                E,
+                dyn_cast_or_null<RecordStorageLocation>(
+                    getLocBehindPossiblePointer(*E->getArg(0), State.Env)),
+                State, [](StorageLocation &Loc) {});
+          })
+      .CaseOfCFGStmt<CXXMemberCallExpr>(
+          isSmartPointerLikeValueMethodCall(),
+          [](const CXXMemberCallExpr *E, const MatchFinder::MatchResult &Result,
+             LatticeTransferState &State) {
+            transferSmartPointerLikeCachedDeref(
+                E, getImplicitObjectLocation(*E, State.Env), State,
+                [](StorageLocation &Loc) {});
+          })
+      .CaseOfCFGStmt<CXXMemberCallExpr>(
+          isSmartPointerLikeGetMethodCall(),
+          [](const CXXMemberCallExpr *E, const MatchFinder::MatchResult &Result,
+             LatticeTransferState &State) {
+            transferSmartPointerLikeCachedGet(
+                E, getImplicitObjectLocation(*E, State.Env), State,
+                [](StorageLocation &Loc) {});
+          })
+
       // const accessor calls
       .CaseOfCFGStmt<CXXMemberCallExpr>(isZeroParamConstMemberCall(),
                                         transferValue_ConstMemberCall)
diff --git a/clang/lib/Analysis/FlowSensitive/SmartPointerAccessorCaching.cpp b/clang/lib/Analysis/FlowSensitive/SmartPointerAccessorCaching.cpp
index a0c81aa933da..c58bd309545d 100644
--- a/clang/lib/Analysis/FlowSensitive/SmartPointerAccessorCaching.cpp
+++ b/clang/lib/Analysis/FlowSensitive/SmartPointerAccessorCaching.cpp
@@ -132,6 +132,7 @@ ast_matchers::StatementMatcher isSmartPointerLikeOperatorArrow() {
       callee(cxxMethodDecl(parameterCountIs(0), returns(pointerType()),
                            ofClass(smartPointerClassWithGetOrValue()))));
 }
+
 ast_matchers::StatementMatcher isSmartPointerLikeValueMethodCall() {
   return cxxMemberCallExpr(callee(
       cxxMethodDecl(parameterCountIs(0), returns(referenceType()),
@@ -144,4 +145,24 @@ ast_matchers::StatementMatcher isSmartPointerLikeGetMethodCall() {
                     ofClass(smartPointerClassWithGet()))));
 }
 
+const FunctionDecl *
+getCanonicalSmartPointerLikeOperatorCallee(const CallExpr *CE) {
+  const FunctionDecl *CanonicalCallee = nullptr;
+  const CXXMethodDecl *Callee =
+      cast_or_null<CXXMethodDecl>(CE->getDirectCallee());
+  if (Callee == nullptr)
+    return nullptr;
+  const CXXRecordDecl *RD = Callee->getParent();
+  if (RD == nullptr)
+    return nullptr;
+  for (const auto *MD : RD->methods()) {
+    if (MD->getOverloadedOperator() == OO_Star && MD->isConst() &&
+        MD->getNumParams() == 0 && MD->getReturnType()->isReferenceType()) {
+      CanonicalCallee = MD;
+      break;
+    }
+  }
+  return CanonicalCallee;
+}
+
 } // namespace clang::dataflow
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 53e102bbe446..2b4b954d0c27 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -714,7 +714,7 @@ AArch64TargetInfo::getVScaleRange(const LangOptions &LangOpts) const {
   return std::nullopt;
 }
 
-unsigned AArch64TargetInfo::getFMVPriority(ArrayRef<StringRef> Features) const {
+uint64_t AArch64TargetInfo::getFMVPriority(ArrayRef<StringRef> Features) const {
   return llvm::AArch64::getFMVPriority(Features);
 }
 
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index 68a8b1ebad8c..4e927c0953b1 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -137,7 +137,7 @@ public:
   void fillValidCPUList(SmallVectorImpl<StringRef> &Values) const override;
   bool setCPU(const std::string &Name) override;
 
-  unsigned getFMVPriority(ArrayRef<StringRef> Features) const override;
+  uint64_t getFMVPriority(ArrayRef<StringRef> Features) const override;
 
   bool useFP16ConversionIntrinsics() const override {
     return false;
diff --git a/clang/lib/Basic/Targets/OSTargets.cpp b/clang/lib/Basic/Targets/OSTargets.cpp
index 88c054150ab2..6f98353fb8c2 100644
--- a/clang/lib/Basic/Targets/OSTargets.cpp
+++ b/clang/lib/Basic/Targets/OSTargets.cpp
@@ -114,9 +114,6 @@ void getDarwinDefines(MacroBuilder &Builder, const LangOptions &Opts,
     assert(OsVersion.getMinor().value_or(0) < 100 &&
            OsVersion.getSubminor().value_or(0) < 100 && "Invalid version!");
     Builder.defineMacro("__ENVIRONMENT_OS_VERSION_MIN_REQUIRED__", Str);
-
-    // Tell users about the kernel if there is one.
-    Builder.defineMacro("__MACH__");
   }
 
   PlatformMinVersion = OsVersion;
diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp
index a541dfedc9b8..db23b0c22833 100644
--- a/clang/lib/Basic/Targets/RISCV.cpp
+++ b/clang/lib/Basic/Targets/RISCV.cpp
@@ -489,7 +489,7 @@ ParsedTargetAttr RISCVTargetInfo::parseTargetAttr(StringRef Features) const {
   return Ret;
 }
 
-unsigned RISCVTargetInfo::getFMVPriority(ArrayRef<StringRef> Features) const {
+uint64_t RISCVTargetInfo::getFMVPriority(ArrayRef<StringRef> Features) const {
   // Priority is explicitly specified on RISC-V unlike on other targets, where
   // it is derived by all the features of a specific version. Therefore if a
   // feature contains the priority string, then return it immediately.
@@ -501,7 +501,7 @@ unsigned RISCVTargetInfo::getFMVPriority(ArrayRef<StringRef> Features) const {
       Feature = RHS;
     else
       continue;
-    unsigned Priority;
+    uint64_t Priority;
     if (!Feature.getAsInteger(0, Priority))
       return Priority;
   }
diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h
index 68f10e74ba98..bb3f3a5cda7c 100644
--- a/clang/lib/Basic/Targets/RISCV.h
+++ b/clang/lib/Basic/Targets/RISCV.h
@@ -122,7 +122,7 @@ public:
   void fillValidTuneCPUList(SmallVectorImpl<StringRef> &Values) const override;
   bool supportsTargetAttributeTune() const override { return true; }
   ParsedTargetAttr parseTargetAttr(StringRef Str) const override;
-  unsigned getFMVPriority(ArrayRef<StringRef> Features) const override;
+  uint64_t getFMVPriority(ArrayRef<StringRef> Features) const override;
 
   std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
     return std::make_pair(32, 32);
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index d2d92fb864c3..40ad8fd9a096 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -1357,8 +1357,8 @@ static llvm::X86::ProcessorFeatures getFeature(StringRef Name) {
   // correct, so it asserts if the value is out of range.
 }
 
-unsigned X86TargetInfo::getFMVPriority(ArrayRef<StringRef> Features) const {
-  auto getPriority = [](StringRef Feature) -> unsigned {
+uint64_t X86TargetInfo::getFMVPriority(ArrayRef<StringRef> Features) const {
+  auto getPriority = [](StringRef Feature) -> uint64_t {
     // Valid CPUs have a 'key feature' that compares just better than its key
     // feature.
     using namespace llvm::X86;
@@ -1372,7 +1372,7 @@ unsigned X86TargetInfo::getFMVPriority(ArrayRef<StringRef> Features) const {
     return getFeaturePriority(getFeature(Feature)) << 1;
   };
 
-  unsigned Priority = 0;
+  uint64_t Priority = 0;
   for (StringRef Feature : Features)
     if (!Feature.empty())
       Priority = std::max(Priority, getPriority(Feature));
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index 553c452d4ba3..35aceb1c58e1 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -384,7 +384,7 @@ public:
     return CPU != llvm::X86::CK_None;
   }
 
-  unsigned getFMVPriority(ArrayRef<StringRef> Features) const override;
+  uint64_t getFMVPriority(ArrayRef<StringRef> Features) const override;
 
   bool setFPMath(StringRef Name) override;
 
diff --git a/clang/lib/CodeGen/BackendConsumer.h b/clang/lib/CodeGen/BackendConsumer.h
index a023d29cbd1d..d932a78f469b 100644
--- a/clang/lib/CodeGen/BackendConsumer.h
+++ b/clang/lib/CodeGen/BackendConsumer.h
@@ -29,17 +29,16 @@ class BackendConsumer : public ASTConsumer {
 
   virtual void anchor();
   DiagnosticsEngine &Diags;
-  BackendAction Action;
   const HeaderSearchOptions &HeaderSearchOpts;
   const CodeGenOptions &CodeGenOpts;
   const TargetOptions &TargetOpts;
   const LangOptions &LangOpts;
   std::unique_ptr<raw_pwrite_stream> AsmOutStream;
-  ASTContext *Context;
+  ASTContext *Context = nullptr;
   IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS;
 
   llvm::Timer LLVMIRGeneration;
-  unsigned LLVMIRGenerationRefCount;
+  unsigned LLVMIRGenerationRefCount = 0;
 
   /// True if we've finished generating IR. This prevents us from generating
   /// additional LLVM IR after emitting output in HandleTranslationUnit. This
@@ -48,6 +47,8 @@ class BackendConsumer : public ASTConsumer {
 
   bool TimerIsEnabled = false;
 
+  BackendAction Action;
+
   std::unique_ptr<CodeGenerator> Gen;
 
   SmallVector<LinkModule, 4> LinkModules;
@@ -69,29 +70,12 @@ class BackendConsumer : public ASTConsumer {
   llvm::Module *CurLinkModule = nullptr;
 
 public:
-  BackendConsumer(BackendAction Action, DiagnosticsEngine &Diags,
-                  IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS,
-                  const HeaderSearchOptions &HeaderSearchOpts,
-                  const PreprocessorOptions &PPOpts,
-                  const CodeGenOptions &CodeGenOpts,
-                  const TargetOptions &TargetOpts, const LangOptions &LangOpts,
-                  const std::string &InFile,
-                  SmallVector<LinkModule, 4> LinkModules,
-                  std::unique_ptr<raw_pwrite_stream> OS, llvm::LLVMContext &C,
-                  CoverageSourceInfo *CoverageInfo = nullptr);
-
-  // This constructor is used in installing an empty BackendConsumer
-  // to use the clang diagnostic handler for IR input files. It avoids
-  // initializing the OS field.
-  BackendConsumer(BackendAction Action, DiagnosticsEngine &Diags,
+  BackendConsumer(const CompilerInstance &CI, BackendAction Action,
                   IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS,
-                  const HeaderSearchOptions &HeaderSearchOpts,
-                  const PreprocessorOptions &PPOpts,
-                  const CodeGenOptions &CodeGenOpts,
-                  const TargetOptions &TargetOpts, const LangOptions &LangOpts,
-                  llvm::Module *Module, SmallVector<LinkModule, 4> LinkModules,
-                  llvm::LLVMContext &C,
-                  CoverageSourceInfo *CoverageInfo = nullptr);
+                  llvm::LLVMContext &C, SmallVector<LinkModule, 4> LinkModules,
+                  StringRef InFile, std::unique_ptr<raw_pwrite_stream> OS,
+                  CoverageSourceInfo *CoverageInfo,
+                  llvm::Module *CurLinkModule = nullptr);
 
   llvm::Module *getModule() const;
   std::unique_ptr<llvm::Module> takeModule();
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 04358cd6d7c2..2dbab785658a 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -736,10 +736,8 @@ static void addSanitizers(const Triple &TargetTriple,
       MPM.addPass(createModuleToFunctionPassAdaptor(ThreadSanitizerPass()));
     }
 
-    if (LangOpts.Sanitize.has(SanitizerKind::Type)) {
-      MPM.addPass(ModuleTypeSanitizerPass());
-      MPM.addPass(createModuleToFunctionPassAdaptor(TypeSanitizerPass()));
-    }
+    if (LangOpts.Sanitize.has(SanitizerKind::Type))
+      MPM.addPass(TypeSanitizerPass());
 
     if (LangOpts.Sanitize.has(SanitizerKind::NumericalStability))
       MPM.addPass(NumericalStabilitySanitizerPass());
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 5cd893d70695..573be932f8b1 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -11327,6 +11327,19 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
   if (Builtin->LLVMIntrinsic == 0)
     return nullptr;
 
+  if (BuiltinID == SME::BI__builtin_sme___arm_in_streaming_mode) {
+    // If we already know the streaming mode, don't bother with the intrinsic
+    // and emit a constant instead
+    const auto *FD = cast<FunctionDecl>(CurFuncDecl);
+    if (const auto *FPT = FD->getType()->getAs<FunctionProtoType>()) {
+      unsigned SMEAttrs = FPT->getAArch64SMEAttributes();
+      if (!(SMEAttrs & FunctionType::SME_PStateSMCompatibleMask)) {
+        bool IsStreaming = SMEAttrs & FunctionType::SME_PStateSMEnabledMask;
+        return ConstantInt::getBool(Builder.getContext(), IsStreaming);
+      }
+    }
+  }
+
   // Predicates must match the main datatype.
   for (unsigned i = 0, e = Ops.size(); i != e; ++i)
     if (auto PredTy = dyn_cast<llvm::VectorType>(Ops[i]->getType()))
@@ -19199,8 +19212,9 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     // TODO: Map to an hlsl_device address space.
     llvm::Type *RetTy = llvm::PointerType::getUnqual(getLLVMContext());
 
-    return Builder.CreateIntrinsic(RetTy, Intrinsic::dx_resource_getpointer,
-                                   ArrayRef<Value *>{HandleOp, IndexOp});
+    return Builder.CreateIntrinsic(
+        RetTy, CGM.getHLSLRuntime().getCreateResourceGetPointerIntrinsic(),
+        ArrayRef<Value *>{HandleOp, IndexOp});
   }
   case Builtin::BI__builtin_hlsl_all: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 89e2eace9120..7b0ef4be9861 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -6090,6 +6090,8 @@ RValue CodeGenFunction::EmitVAArg(VAArgExpr *VE, Address &VAListAddr,
   VAListAddr = VE->isMicrosoftABI() ? EmitMSVAListRef(VE->getSubExpr())
                                     : EmitVAListRef(VE->getSubExpr());
   QualType Ty = VE->getType();
+  if (Ty->isVariablyModifiedType())
+    EmitVariablyModifiedType(Ty);
   if (VE->isMicrosoftABI())
     return CGM.getABIInfo().EmitMSVAArg(*this, VAListAddr, Ty, Slot);
   return CGM.getABIInfo().EmitVAArg(*this, VAListAddr, Ty, Slot);
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index 47b21bc9f63f..6f3ff050cb69 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -361,6 +361,8 @@ CodeGenFunction::AddInitializerToStaticVarDecl(const VarDecl &D,
     return GV;
   }
 
+  PGO.markStmtMaybeUsed(D.getInit()); // FIXME: Too lazy
+
 #ifndef NDEBUG
   CharUnits VarSize = CGM.getContext().getTypeSizeInChars(D.getType()) +
                       D.getFlexibleArrayInitChars(getContext());
@@ -1868,7 +1870,10 @@ void CodeGenFunction::EmitAutoVarInit(const AutoVarEmission &emission) {
   // If we are at an unreachable point, we don't need to emit the initializer
   // unless it contains a label.
   if (!HaveInsertPoint()) {
-    if (!Init || !ContainsLabel(Init)) return;
+    if (!Init || !ContainsLabel(Init)) {
+      PGO.markStmtMaybeUsed(Init);
+      return;
+    }
     EnsureInsertPoint();
   }
 
@@ -1979,6 +1984,8 @@ void CodeGenFunction::EmitAutoVarInit(const AutoVarEmission &emission) {
     return EmitExprAsInit(Init, &D, lv, capturedByInit);
   }
 
+  PGO.markStmtMaybeUsed(Init);
+
   if (!emission.IsConstantAggregate) {
     // For simple scalar/complex initialization, store the value directly.
     LValue lv = MakeAddrLValue(Loc, type);
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index ba1cba291553..1bad7a722da0 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -5148,6 +5148,7 @@ std::optional<LValue> HandleConditionalOperatorLValueSimpleCase(
       // If the true case is live, we need to track its region.
       if (CondExprBool)
         CGF.incrementProfileCounter(E);
+      CGF.markStmtMaybeUsed(Dead);
       // If a throw expression we emit it and return an undefined lvalue
       // because it can't be used.
       if (auto *ThrowExpr = dyn_cast<CXXThrowExpr>(Live->IgnoreParens())) {
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 4b71bd730ce1..0f27bd00422d 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -5003,7 +5003,8 @@ Value *ScalarExprEmitter::VisitBinLAnd(const BinaryOperator *E) {
         CGF.incrementProfileCounter(E->getRHS());
         CGF.EmitBranch(FBlock);
         CGF.EmitBlock(FBlock);
-      }
+      } else
+        CGF.markStmtMaybeUsed(E->getRHS());
 
       CGF.MCDCLogOpStack.pop_back();
       // If the top of the logical operator nest, update the MCDC bitmap.
@@ -5015,8 +5016,10 @@ Value *ScalarExprEmitter::VisitBinLAnd(const BinaryOperator *E) {
     }
 
     // 0 && RHS: If it is safe, just elide the RHS, and return 0/false.
-    if (!CGF.ContainsLabel(E->getRHS()))
+    if (!CGF.ContainsLabel(E->getRHS())) {
+      CGF.markStmtMaybeUsed(E->getRHS());
       return llvm::Constant::getNullValue(ResTy);
+    }
   }
 
   // If the top of the logical operator nest, reset the MCDC temp to 0.
@@ -5143,7 +5146,8 @@ Value *ScalarExprEmitter::VisitBinLOr(const BinaryOperator *E) {
         CGF.incrementProfileCounter(E->getRHS());
         CGF.EmitBranch(FBlock);
         CGF.EmitBlock(FBlock);
-      }
+      } else
+        CGF.markStmtMaybeUsed(E->getRHS());
 
       CGF.MCDCLogOpStack.pop_back();
       // If the top of the logical operator nest, update the MCDC bitmap.
@@ -5155,8 +5159,10 @@ Value *ScalarExprEmitter::VisitBinLOr(const BinaryOperator *E) {
     }
 
     // 1 || RHS: If it is safe, just elide the RHS, and return 1/true.
-    if (!CGF.ContainsLabel(E->getRHS()))
+    if (!CGF.ContainsLabel(E->getRHS())) {
+      CGF.markStmtMaybeUsed(E->getRHS());
       return llvm::ConstantInt::get(ResTy, 1);
+    }
   }
 
   // If the top of the logical operator nest, reset the MCDC temp to 0.
@@ -5280,6 +5286,7 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) {
         CGF.incrementProfileCounter(E);
       }
       Value *Result = Visit(live);
+      CGF.markStmtMaybeUsed(dead);
 
       // If the live part is a throw expression, it acts like it has a void
       // type, so evaluating it returns a null Value*.  However, a conditional
@@ -5448,11 +5455,6 @@ Value *ScalarExprEmitter::VisitChooseExpr(ChooseExpr *E) {
 }
 
 Value *ScalarExprEmitter::VisitVAArgExpr(VAArgExpr *VE) {
-  QualType Ty = VE->getType();
-
-  if (Ty->isVariablyModifiedType())
-    CGF.EmitVariablyModifiedType(Ty);
-
   Address ArgValue = Address::invalid();
   RValue ArgPtr = CGF.EmitVAArg(VE, ArgValue);
 
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index 3d5724118611..46e472f0aae2 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -104,6 +104,8 @@ public:
   GENERATE_HLSL_INTRINSIC_FUNCTION(SClamp, sclamp)
   GENERATE_HLSL_INTRINSIC_FUNCTION(UClamp, uclamp)
 
+  GENERATE_HLSL_INTRINSIC_FUNCTION(CreateResourceGetPointer,
+                                   resource_getpointer)
   GENERATE_HLSL_INTRINSIC_FUNCTION(CreateHandleFromBinding,
                                    resource_handlefrombinding)
   GENERATE_HLSL_INTRINSIC_FUNCTION(BufferUpdateCounter, resource_updatecounter)
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 85be2b47c6f2..e9a8500cc199 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -76,6 +76,7 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef<const Attr *> Attrs) {
       // Verify that any decl statements were handled as simple, they may be in
       // scope of subsequent reachable statements.
       assert(!isa<DeclStmt>(*S) && "Unexpected DeclStmt!");
+      PGO.markStmtMaybeUsed(S);
       return;
     }
 
@@ -482,6 +483,9 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef<const Attr *> Attrs) {
   case Stmt::OpenACCSetConstructClass:
     EmitOpenACCSetConstruct(cast<OpenACCSetConstruct>(*S));
     break;
+  case Stmt::OpenACCUpdateConstructClass:
+    EmitOpenACCUpdateConstruct(cast<OpenACCUpdateConstruct>(*S));
+    break;
   }
 }
 
@@ -754,8 +758,6 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) {
   bool noinline = false;
   bool alwaysinline = false;
   bool noconvergent = false;
-  HLSLControlFlowHintAttr::Spelling flattenOrBranch =
-      HLSLControlFlowHintAttr::SpellingNotCalculated;
   const CallExpr *musttail = nullptr;
 
   for (const auto *A : S.getAttrs()) {
@@ -787,9 +789,6 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) {
         Builder.CreateAssumption(AssumptionVal);
       }
     } break;
-    case attr::HLSLControlFlowHint: {
-      flattenOrBranch = cast<HLSLControlFlowHintAttr>(A)->getSemanticSpelling();
-    } break;
     }
   }
   SaveAndRestore save_nomerge(InNoMergeAttributedStmt, nomerge);
@@ -797,7 +796,6 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) {
   SaveAndRestore save_alwaysinline(InAlwaysInlineAttributedStmt, alwaysinline);
   SaveAndRestore save_noconvergent(InNoConvergentAttributedStmt, noconvergent);
   SaveAndRestore save_musttail(MustTailCall, musttail);
-  SaveAndRestore save_flattenOrBranch(HLSLControlFlowAttr, flattenOrBranch);
   EmitStmt(S.getSubStmt(), S.getAttrs());
 }
 
@@ -878,6 +876,7 @@ void CodeGenFunction::EmitIfStmt(const IfStmt &S) {
         RunCleanupsScope ExecutedScope(*this);
         EmitStmt(Executed);
       }
+      PGO.markStmtMaybeUsed(Skipped);
       return;
     }
   }
@@ -2200,6 +2199,7 @@ void CodeGenFunction::EmitSwitchStmt(const SwitchStmt &S) {
       for (unsigned i = 0, e = CaseStmts.size(); i != e; ++i)
         EmitStmt(CaseStmts[i]);
       incrementProfileCounter(&S);
+      PGO.markStmtMaybeUsed(S.getBody());
 
       // Now we want to restore the saved switch instance so that nested
       // switches continue to function properly
diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp
index cc927f44e032..f63cb9b082d5 100644
--- a/clang/lib/CodeGen/CodeGenAction.cpp
+++ b/clang/lib/CodeGen/CodeGenAction.cpp
@@ -106,46 +106,19 @@ static void reportOptRecordError(Error E, DiagnosticsEngine &Diags,
 }
 
 BackendConsumer::BackendConsumer(
-    BackendAction Action, DiagnosticsEngine &Diags,
-    IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS,
-    const HeaderSearchOptions &HeaderSearchOpts,
-    const PreprocessorOptions &PPOpts, const CodeGenOptions &CodeGenOpts,
-    const TargetOptions &TargetOpts, const LangOptions &LangOpts,
-    const std::string &InFile, SmallVector<LinkModule, 4> LinkModules,
-    std::unique_ptr<raw_pwrite_stream> OS, LLVMContext &C,
-    CoverageSourceInfo *CoverageInfo)
-    : Diags(Diags), Action(Action), HeaderSearchOpts(HeaderSearchOpts),
-      CodeGenOpts(CodeGenOpts), TargetOpts(TargetOpts), LangOpts(LangOpts),
-      AsmOutStream(std::move(OS)), Context(nullptr), FS(VFS),
-      LLVMIRGeneration("irgen", "LLVM IR Generation Time"),
-      LLVMIRGenerationRefCount(0),
-      Gen(CreateLLVMCodeGen(Diags, InFile, std::move(VFS), HeaderSearchOpts,
-                            PPOpts, CodeGenOpts, C, CoverageInfo)),
-      LinkModules(std::move(LinkModules)) {
-  TimerIsEnabled = CodeGenOpts.TimePasses;
-  llvm::TimePassesIsEnabled = CodeGenOpts.TimePasses;
-  llvm::TimePassesPerRun = CodeGenOpts.TimePassesPerRun;
-}
-
-// This constructor is used in installing an empty BackendConsumer
-// to use the clang diagnostic handler for IR input files. It avoids
-// initializing the OS field.
-BackendConsumer::BackendConsumer(
-    BackendAction Action, DiagnosticsEngine &Diags,
-    IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS,
-    const HeaderSearchOptions &HeaderSearchOpts,
-    const PreprocessorOptions &PPOpts, const CodeGenOptions &CodeGenOpts,
-    const TargetOptions &TargetOpts, const LangOptions &LangOpts,
-    llvm::Module *Module, SmallVector<LinkModule, 4> LinkModules,
-    LLVMContext &C, CoverageSourceInfo *CoverageInfo)
-    : Diags(Diags), Action(Action), HeaderSearchOpts(HeaderSearchOpts),
-      CodeGenOpts(CodeGenOpts), TargetOpts(TargetOpts), LangOpts(LangOpts),
-      Context(nullptr), FS(VFS),
-      LLVMIRGeneration("irgen", "LLVM IR Generation Time"),
-      LLVMIRGenerationRefCount(0),
-      Gen(CreateLLVMCodeGen(Diags, "", std::move(VFS), HeaderSearchOpts, PPOpts,
-                            CodeGenOpts, C, CoverageInfo)),
-      LinkModules(std::move(LinkModules)), CurLinkModule(Module) {
+    const CompilerInstance &CI, BackendAction Action,
+    IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS, LLVMContext &C,
+    SmallVector<LinkModule, 4> LinkModules, StringRef InFile,
+    std::unique_ptr<raw_pwrite_stream> OS, CoverageSourceInfo *CoverageInfo,
+    llvm::Module *CurLinkModule)
+    : Diags(CI.getDiagnostics()), HeaderSearchOpts(CI.getHeaderSearchOpts()),
+      CodeGenOpts(CI.getCodeGenOpts()), TargetOpts(CI.getTargetOpts()),
+      LangOpts(CI.getLangOpts()), AsmOutStream(std::move(OS)), FS(VFS),
+      LLVMIRGeneration("irgen", "LLVM IR Generation Time"), Action(Action),
+      Gen(CreateLLVMCodeGen(Diags, InFile, std::move(VFS),
+                            CI.getHeaderSearchOpts(), CI.getPreprocessorOpts(),
+                            CI.getCodeGenOpts(), C, CoverageInfo)),
+      LinkModules(std::move(LinkModules)), CurLinkModule(CurLinkModule) {
   TimerIsEnabled = CodeGenOpts.TimePasses;
   llvm::TimePassesIsEnabled = CodeGenOpts.TimePasses;
   llvm::TimePassesPerRun = CodeGenOpts.TimePassesPerRun;
@@ -1011,10 +984,8 @@ CodeGenAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) {
         CI.getPreprocessor());
 
   std::unique_ptr<BackendConsumer> Result(new BackendConsumer(
-      BA, CI.getDiagnostics(), &CI.getVirtualFileSystem(),
-      CI.getHeaderSearchOpts(), CI.getPreprocessorOpts(), CI.getCodeGenOpts(),
-      CI.getTargetOpts(), CI.getLangOpts(), std::string(InFile),
-      std::move(LinkModules), std::move(OS), *VMContext, CoverageInfo));
+      CI, BA, &CI.getVirtualFileSystem(), *VMContext, std::move(LinkModules),
+      InFile, std::move(OS), CoverageInfo));
   BEConsumer = Result.get();
 
   // Enable generating macro debug info only when debug info is not disabled and
@@ -1182,11 +1153,9 @@ void CodeGenAction::ExecuteAction() {
 
   // Set clang diagnostic handler. To do this we need to create a fake
   // BackendConsumer.
-  BackendConsumer Result(BA, CI.getDiagnostics(), &CI.getVirtualFileSystem(),
-                         CI.getHeaderSearchOpts(), CI.getPreprocessorOpts(),
-                         CI.getCodeGenOpts(), CI.getTargetOpts(),
-                         CI.getLangOpts(), TheModule.get(),
-                         std::move(LinkModules), *VMContext, nullptr);
+  BackendConsumer Result(CI, BA, &CI.getVirtualFileSystem(), *VMContext,
+                         std::move(LinkModules), "", nullptr, nullptr,
+                         TheModule.get());
 
   // Link in each pending link module.
   if (!CodeGenOpts.LinkBitcodePostopt && Result.LinkInModules(&*TheModule))
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 067ff55b87ae..27ec68bd2a87 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -40,7 +40,6 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/FPEnv.h"
-#include "llvm/IR/Instruction.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/MDBuilder.h"
@@ -1617,6 +1616,8 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn,
   // Emit the standard function epilogue.
   FinishFunction(BodyRange.getEnd());
 
+  PGO.verifyCounterMap();
+
   // If we haven't marked the function nothrow through other means, do
   // a quick pass now to see if we can.
   if (!CurFn->doesNotThrow())
@@ -1739,6 +1740,7 @@ bool CodeGenFunction::ConstantFoldsToSimpleInteger(const Expr *Cond,
   if (!AllowLabels && CodeGenFunction::ContainsLabel(Cond))
     return false;  // Contains a label.
 
+  PGO.markStmtMaybeUsed(Cond);
   ResultInt = Int;
   return true;
 }
@@ -2084,29 +2086,7 @@ void CodeGenFunction::EmitBranchOnBoolExpr(
     Weights = createProfileWeights(TrueCount, CurrentCount - TrueCount);
   }
 
-  llvm::Instruction *BrInst = Builder.CreateCondBr(CondV, TrueBlock, FalseBlock,
-                                                   Weights, Unpredictable);
-  switch (HLSLControlFlowAttr) {
-  case HLSLControlFlowHintAttr::Microsoft_branch:
-  case HLSLControlFlowHintAttr::Microsoft_flatten: {
-    llvm::MDBuilder MDHelper(CGM.getLLVMContext());
-
-    llvm::ConstantInt *BranchHintConstant =
-        HLSLControlFlowAttr ==
-                HLSLControlFlowHintAttr::Spelling::Microsoft_branch
-            ? llvm::ConstantInt::get(CGM.Int32Ty, 1)
-            : llvm::ConstantInt::get(CGM.Int32Ty, 2);
-
-    SmallVector<llvm::Metadata *, 2> Vals(
-        {MDHelper.createString("hlsl.controlflow.hint"),
-         MDHelper.createConstant(BranchHintConstant)});
-    BrInst->setMetadata("hlsl.controlflow.hint",
-                        llvm::MDNode::get(CGM.getLLVMContext(), Vals));
-    break;
-  }
-  case HLSLControlFlowHintAttr::SpellingNotCalculated:
-    break;
-  }
+  Builder.CreateCondBr(CondV, TrueBlock, FalseBlock, Weights, Unpredictable);
 }
 
 /// ErrorUnsupported - Print out an error that codegen doesn't support the
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 0cd03c97ae99..e2dc0b1e3816 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -615,10 +615,6 @@ public:
   /// True if the current statement has noconvergent attribute.
   bool InNoConvergentAttributedStmt = false;
 
-  /// HLSL Branch attribute.
-  HLSLControlFlowHintAttr::Spelling HLSLControlFlowAttr =
-      HLSLControlFlowHintAttr::SpellingNotCalculated;
-
   // The CallExpr within the current statement that the musttail attribute
   // applies to.  nullptr if there is no 'musttail' on the current statement.
   const CallExpr *MustTailCall = nullptr;
@@ -1624,6 +1620,13 @@ private:
                                             uint64_t LoopCount) const;
 
 public:
+  auto getIsCounterPair(const Stmt *S) const { return PGO.getIsCounterPair(S); }
+
+  void markStmtAsUsed(bool Skipped, const Stmt *S) {
+    PGO.markStmtAsUsed(Skipped, S);
+  }
+  void markStmtMaybeUsed(const Stmt *S) { PGO.markStmtMaybeUsed(S); }
+
   /// Increment the profiler's counter for the given statement by \p StepV.
   /// If \p StepV is null, the default increment is 1.
   void incrementProfileCounter(const Stmt *S, llvm::Value *StepV = nullptr) {
@@ -4142,6 +4145,11 @@ public:
     // but in the future we will implement some sort of IR.
   }
 
+  void EmitOpenACCUpdateConstruct(const OpenACCUpdateConstruct &S) {
+    // TODO OpenACC: Implement this.  It is currently implemented as a 'no-op',
+    // but in the future we will implement some sort of IR.
+  }
+
   //===--------------------------------------------------------------------===//
   //                         LValue Expression Emission
   //===--------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index c49f76314882..7db1ed72fa5c 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -2748,7 +2748,21 @@ bool CodeGenModule::GetCPUAndFeaturesAttributes(GlobalDecl GD,
     Attrs.addAttribute("target-features", llvm::join(Features, ","));
     AddedAttr = true;
   }
-
+  if (getTarget().getTriple().isAArch64()) {
+    llvm::SmallVector<StringRef, 8> Feats;
+    if (TV)
+      TV->getFeatures(Feats);
+    else if (TC)
+      TC->getFeatures(Feats, GD.getMultiVersionIndex());
+    if (!Feats.empty()) {
+      llvm::sort(Feats);
+      std::string FMVFeatures;
+      for (StringRef F : Feats)
+        FMVFeatures.append(",+" + F.str());
+      Attrs.addAttribute("fmv-features", FMVFeatures.substr(1));
+      AddedAttr = true;
+    }
+  }
   return AddedAttr;
 }
 
@@ -4227,7 +4241,7 @@ void CodeGenModule::EmitGlobalDefinition(GlobalDecl GD, llvm::GlobalValue *GV) {
 static void ReplaceUsesOfNonProtoTypeWithRealFunction(llvm::GlobalValue *Old,
                                                       llvm::Function *NewFn);
 
-static unsigned getFMVPriority(const TargetInfo &TI,
+static uint64_t getFMVPriority(const TargetInfo &TI,
                                const CodeGenFunction::FMVResolverOption &RO) {
   llvm::SmallVector<StringRef, 8> Features{RO.Features};
   if (RO.Architecture)
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
index 741b0f17da65..d5ef1a710eb4 100644
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -102,6 +102,50 @@ enum ForDefinition_t : bool {
   ForDefinition = true
 };
 
+/// The Counter with an optional additional Counter for
+/// branches. `Skipped` counter can be calculated with `Executed` and
+/// a common Counter (like `Parent`) as `(Parent-Executed)`.
+///
+/// In SingleByte mode, Counters are binary. Subtraction is not
+/// applicable (but addition is capable). In this case, both
+/// `Executed` and `Skipped` counters are required.  `Skipped` is
+/// `None` by default. It is allocated in the coverage mapping.
+///
+/// There might be cases that `Parent` could be induced with
+/// `(Executed+Skipped)`. This is not always applicable.
+class CounterPair {
+public:
+  /// Optional value.
+  class ValueOpt {
+  private:
+    static constexpr uint32_t None = (1u << 31); /// None is allocated.
+    static constexpr uint32_t Mask = None - 1;
+
+    uint32_t Val;
+
+  public:
+    ValueOpt() : Val(None) {}
+
+    ValueOpt(unsigned InitVal) {
+      assert(!(InitVal & ~Mask));
+      Val = InitVal;
+    }
+
+    bool hasValue() const { return !(Val & None); }
+
+    operator uint32_t() const { return Val; }
+  };
+
+  ValueOpt Executed;
+  ValueOpt Skipped; /// May be None.
+
+  /// Initialized with Skipped=None.
+  CounterPair(unsigned Val) : Executed(Val) {}
+
+  // FIXME: Should work with {None, None}
+  CounterPair() : Executed(0) {}
+};
+
 struct OrderGlobalInitsOrStermFinalizers {
   unsigned int priority;
   unsigned int lex_order;
diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index 17d7902f0cfb..792373839107 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -163,7 +163,7 @@ struct MapRegionCounters : public RecursiveASTVisitor<MapRegionCounters> {
   /// The function hash.
   PGOHash Hash;
   /// The map of statements to counters.
-  llvm::DenseMap<const Stmt *, unsigned> &CounterMap;
+  llvm::DenseMap<const Stmt *, CounterPair> &CounterMap;
   /// The state of MC/DC Coverage in this function.
   MCDC::State &MCDCState;
   /// Maximum number of supported MC/DC conditions in a boolean expression.
@@ -174,7 +174,7 @@ struct MapRegionCounters : public RecursiveASTVisitor<MapRegionCounters> {
   DiagnosticsEngine &Diag;
 
   MapRegionCounters(PGOHashVersion HashVersion, uint64_t ProfileVersion,
-                    llvm::DenseMap<const Stmt *, unsigned> &CounterMap,
+                    llvm::DenseMap<const Stmt *, CounterPair> &CounterMap,
                     MCDC::State &MCDCState, unsigned MCDCMaxCond,
                     DiagnosticsEngine &Diag)
       : NextCounter(0), Hash(HashVersion), CounterMap(CounterMap),
@@ -1083,7 +1083,7 @@ void CodeGenPGO::mapRegionCounters(const Decl *D) {
       (CGM.getCodeGenOpts().MCDCCoverage ? CGM.getCodeGenOpts().MCDCMaxConds
                                          : 0);
 
-  RegionCounterMap.reset(new llvm::DenseMap<const Stmt *, unsigned>);
+  RegionCounterMap.reset(new llvm::DenseMap<const Stmt *, CounterPair>);
   RegionMCDCState.reset(new MCDC::State);
   MapRegionCounters Walker(HashVersion, ProfileVersion, *RegionCounterMap,
                            *RegionMCDCState, MCDCMaxConditions, CGM.getDiags());
@@ -1185,12 +1185,23 @@ CodeGenPGO::applyFunctionAttributes(llvm::IndexedInstrProfReader *PGOReader,
   Fn->setEntryCount(FunctionCount);
 }
 
+std::pair<bool, bool> CodeGenPGO::getIsCounterPair(const Stmt *S) const {
+  if (!RegionCounterMap)
+    return {false, false};
+
+  auto I = RegionCounterMap->find(S);
+  if (I == RegionCounterMap->end())
+    return {false, false};
+
+  return {I->second.Executed.hasValue(), I->second.Skipped.hasValue()};
+}
+
 void CodeGenPGO::emitCounterSetOrIncrement(CGBuilderTy &Builder, const Stmt *S,
                                            llvm::Value *StepV) {
   if (!RegionCounterMap || !Builder.GetInsertBlock())
     return;
 
-  unsigned Counter = (*RegionCounterMap)[S];
+  unsigned Counter = (*RegionCounterMap)[S].Executed;
 
   // Make sure that pointer to global is passed in with zero addrspace
   // This is relevant during GPU profiling
diff --git a/clang/lib/CodeGen/CodeGenPGO.h b/clang/lib/CodeGen/CodeGenPGO.h
index 9d66ffad6f43..1944b640951d 100644
--- a/clang/lib/CodeGen/CodeGenPGO.h
+++ b/clang/lib/CodeGen/CodeGenPGO.h
@@ -35,7 +35,7 @@ private:
   std::array <unsigned, llvm::IPVK_Last + 1> NumValueSites;
   unsigned NumRegionCounters;
   uint64_t FunctionHash;
-  std::unique_ptr<llvm::DenseMap<const Stmt *, unsigned>> RegionCounterMap;
+  std::unique_ptr<llvm::DenseMap<const Stmt *, CounterPair>> RegionCounterMap;
   std::unique_ptr<llvm::DenseMap<const Stmt *, uint64_t>> StmtCountMap;
   std::unique_ptr<llvm::InstrProfRecord> ProfRecord;
   std::unique_ptr<MCDC::State> RegionMCDCState;
@@ -110,6 +110,7 @@ private:
   bool canEmitMCDCCoverage(const CGBuilderTy &Builder);
 
 public:
+  std::pair<bool, bool> getIsCounterPair(const Stmt *S) const;
   void emitCounterSetOrIncrement(CGBuilderTy &Builder, const Stmt *S,
                                  llvm::Value *StepV);
   void emitMCDCTestVectorBitmapUpdate(CGBuilderTy &Builder, const Expr *S,
@@ -122,6 +123,18 @@ public:
                                 Address MCDCCondBitmapAddr, llvm::Value *Val,
                                 CodeGenFunction &CGF);
 
+  void markStmtAsUsed(bool Skipped, const Stmt *S) {
+    // Do nothing.
+  }
+
+  void markStmtMaybeUsed(const Stmt *S) {
+    // Do nothing.
+  }
+
+  void verifyCounterMap() const {
+    // Do nothing.
+  }
+
   /// Return the region count for the counter at the given index.
   uint64_t getRegionCount(const Stmt *S) {
     if (!RegionCounterMap)
@@ -130,7 +143,7 @@ public:
       return 0;
     // With profiles from a differing version of clang we can have mismatched
     // decl counts. Don't crash in such a case.
-    auto Index = (*RegionCounterMap)[S];
+    auto Index = (*RegionCounterMap)[S].Executed;
     if (Index >= RegionCounts.size())
       return 0;
     return RegionCounts[Index];
diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp
index cda218eac34a..f09157771d2b 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -882,7 +882,7 @@ struct CounterCoverageMappingBuilder
     : public CoverageMappingBuilder,
       public ConstStmtVisitor<CounterCoverageMappingBuilder> {
   /// The map of statements to count values.
-  llvm::DenseMap<const Stmt *, unsigned> &CounterMap;
+  llvm::DenseMap<const Stmt *, CounterPair> &CounterMap;
 
   MCDC::State &MCDCState;
 
@@ -919,15 +919,11 @@ struct CounterCoverageMappingBuilder
 
   /// Return a counter for the sum of \c LHS and \c RHS.
   Counter addCounters(Counter LHS, Counter RHS, bool Simplify = true) {
-    assert(!llvm::EnableSingleByteCoverage &&
-           "cannot add counters when single byte coverage mode is enabled");
     return Builder.add(LHS, RHS, Simplify);
   }
 
   Counter addCounters(Counter C1, Counter C2, Counter C3,
                       bool Simplify = true) {
-    assert(!llvm::EnableSingleByteCoverage &&
-           "cannot add counters when single byte coverage mode is enabled");
     return addCounters(addCounters(C1, C2, Simplify), C3, Simplify);
   }
 
@@ -935,7 +931,51 @@ struct CounterCoverageMappingBuilder
   ///
   /// This should only be called on statements that have a dedicated counter.
   Counter getRegionCounter(const Stmt *S) {
-    return Counter::getCounter(CounterMap[S]);
+    return Counter::getCounter(CounterMap[S].Executed);
+  }
+
+  struct BranchCounterPair {
+    Counter Executed; ///< The Counter previously assigned.
+    Counter Skipped;  ///< An expression (Parent-Executed), or equivalent to it.
+  };
+
+  /// Retrieve or assign the pair of Counter(s).
+  ///
+  /// This returns BranchCounterPair {Executed, Skipped}.
+  /// Executed is the Counter associated with S assigned by an earlier
+  /// CounterMapping pass.
+  /// Skipped may be an expression (Executed - ParentCnt) or newly
+  /// assigned Counter in EnableSingleByteCoverage, as subtract
+  /// expressions are not available in this mode.
+  ///
+  /// \param S Key to the CounterMap
+  /// \param ParentCnt The Counter representing how many times S is evaluated.
+  /// \param SkipCntForOld (To be removed later) Optional fake Counter
+  ///                      to override Skipped for adjustment of
+  ///                      expressions in the old behavior of
+  ///                      EnableSingleByteCoverage that is unaware of
+  ///                      Branch coverage.
+  BranchCounterPair
+  getBranchCounterPair(const Stmt *S, Counter ParentCnt,
+                       std::optional<Counter> SkipCntForOld = std::nullopt) {
+    Counter ExecCnt = getRegionCounter(S);
+
+    // The old behavior of SingleByte is unaware of Branches.
+    // Will be pruned after the migration of SingleByte.
+    if (llvm::EnableSingleByteCoverage) {
+      assert(SkipCntForOld &&
+             "SingleByte must provide SkipCntForOld as a fake Skipped count.");
+      return {ExecCnt, *SkipCntForOld};
+    }
+
+    return {ExecCnt, Builder.subtract(ParentCnt, ExecCnt)};
+  }
+
+  bool IsCounterEqual(Counter OutCount, Counter ParentCount) {
+    if (OutCount == ParentCount)
+      return true;
+
+    return false;
   }
 
   /// Push a region onto the stack.
@@ -1417,7 +1457,7 @@ struct CounterCoverageMappingBuilder
 
   CounterCoverageMappingBuilder(
       CoverageMappingModuleGen &CVM,
-      llvm::DenseMap<const Stmt *, unsigned> &CounterMap,
+      llvm::DenseMap<const Stmt *, CounterPair> &CounterMap,
       MCDC::State &MCDCState, SourceManager &SM, const LangOptions &LangOpts)
       : CoverageMappingBuilder(CVM, SM, LangOpts), CounterMap(CounterMap),
         MCDCState(MCDCState), MCDCBuilder(CVM.getCodeGenModule(), MCDCState) {}
@@ -1588,6 +1628,10 @@ struct CounterCoverageMappingBuilder
         llvm::EnableSingleByteCoverage
             ? getRegionCounter(S->getCond())
             : addCounters(ParentCount, BackedgeCount, BC.ContinueCount);
+    auto BranchCount = getBranchCounterPair(S, CondCount, getRegionCounter(S));
+    assert(BranchCount.Executed.isZero() || BranchCount.Executed == BodyCount ||
+           llvm::EnableSingleByteCoverage);
+
     propagateCounts(CondCount, S->getCond());
     adjustForOutOfOrderTraversal(getEnd(S));
 
@@ -1596,13 +1640,11 @@ struct CounterCoverageMappingBuilder
     if (Gap)
       fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), BodyCount);
 
-    Counter OutCount =
-        llvm::EnableSingleByteCoverage
-            ? getRegionCounter(S)
-            : addCounters(BC.BreakCount,
-                          subtractCounters(CondCount, BodyCount));
-
-    if (OutCount != ParentCount) {
+    assert(
+        !llvm::EnableSingleByteCoverage ||
+        (BC.BreakCount.isZero() && BranchCount.Skipped == getRegionCounter(S)));
+    Counter OutCount = addCounters(BC.BreakCount, BranchCount.Skipped);
+    if (!IsCounterEqual(OutCount, ParentCount)) {
       pushRegion(OutCount);
       GapRegionCounter = OutCount;
       if (BodyHasTerminateStmt)
@@ -1611,8 +1653,7 @@ struct CounterCoverageMappingBuilder
 
     // Create Branch Region around condition.
     if (!llvm::EnableSingleByteCoverage)
-      createBranchRegion(S->getCond(), BodyCount,
-                         subtractCounters(CondCount, BodyCount));
+      createBranchRegion(S->getCond(), BodyCount, BranchCount.Skipped);
   }
 
   void VisitDoStmt(const DoStmt *S) {
@@ -1641,22 +1682,24 @@ struct CounterCoverageMappingBuilder
     Counter CondCount = llvm::EnableSingleByteCoverage
                             ? getRegionCounter(S->getCond())
                             : addCounters(BackedgeCount, BC.ContinueCount);
+    auto BranchCount = getBranchCounterPair(S, CondCount, getRegionCounter(S));
+    assert(BranchCount.Executed.isZero() || BranchCount.Executed == BodyCount ||
+           llvm::EnableSingleByteCoverage);
+
     propagateCounts(CondCount, S->getCond());
 
-    Counter OutCount =
-        llvm::EnableSingleByteCoverage
-            ? getRegionCounter(S)
-            : addCounters(BC.BreakCount,
-                          subtractCounters(CondCount, BodyCount));
-    if (OutCount != ParentCount) {
+    assert(
+        !llvm::EnableSingleByteCoverage ||
+        (BC.BreakCount.isZero() && BranchCount.Skipped == getRegionCounter(S)));
+    Counter OutCount = addCounters(BC.BreakCount, BranchCount.Skipped);
+    if (!IsCounterEqual(OutCount, ParentCount)) {
       pushRegion(OutCount);
       GapRegionCounter = OutCount;
     }
 
     // Create Branch Region around condition.
     if (!llvm::EnableSingleByteCoverage)
-      createBranchRegion(S->getCond(), BodyCount,
-                         subtractCounters(CondCount, BodyCount));
+      createBranchRegion(S->getCond(), BodyCount, BranchCount.Skipped);
 
     if (BodyHasTerminateStmt)
       HasTerminateStmt = true;
@@ -1705,6 +1748,9 @@ struct CounterCoverageMappingBuilder
             : addCounters(
                   addCounters(ParentCount, BackedgeCount, BodyBC.ContinueCount),
                   IncrementBC.ContinueCount);
+    auto BranchCount = getBranchCounterPair(S, CondCount, getRegionCounter(S));
+    assert(BranchCount.Executed.isZero() || BranchCount.Executed == BodyCount ||
+           llvm::EnableSingleByteCoverage);
 
     if (const Expr *Cond = S->getCond()) {
       propagateCounts(CondCount, Cond);
@@ -1716,12 +1762,11 @@ struct CounterCoverageMappingBuilder
     if (Gap)
       fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), BodyCount);
 
-    Counter OutCount =
-        llvm::EnableSingleByteCoverage
-            ? getRegionCounter(S)
-            : addCounters(BodyBC.BreakCount, IncrementBC.BreakCount,
-                          subtractCounters(CondCount, BodyCount));
-    if (OutCount != ParentCount) {
+    assert(!llvm::EnableSingleByteCoverage ||
+           (BodyBC.BreakCount.isZero() && IncrementBC.BreakCount.isZero()));
+    Counter OutCount = addCounters(BodyBC.BreakCount, IncrementBC.BreakCount,
+                                   BranchCount.Skipped);
+    if (!IsCounterEqual(OutCount, ParentCount)) {
       pushRegion(OutCount);
       GapRegionCounter = OutCount;
       if (BodyHasTerminateStmt)
@@ -1730,8 +1775,7 @@ struct CounterCoverageMappingBuilder
 
     // Create Branch Region around condition.
     if (!llvm::EnableSingleByteCoverage)
-      createBranchRegion(S->getCond(), BodyCount,
-                         subtractCounters(CondCount, BodyCount));
+      createBranchRegion(S->getCond(), BodyCount, BranchCount.Skipped);
   }
 
   void VisitCXXForRangeStmt(const CXXForRangeStmt *S) {
@@ -1759,16 +1803,17 @@ struct CounterCoverageMappingBuilder
     if (Gap)
       fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), BodyCount);
 
-    Counter OutCount;
-    Counter LoopCount;
-    if (llvm::EnableSingleByteCoverage)
-      OutCount = getRegionCounter(S);
-    else {
-      LoopCount = addCounters(ParentCount, BackedgeCount, BC.ContinueCount);
-      OutCount =
-          addCounters(BC.BreakCount, subtractCounters(LoopCount, BodyCount));
-    }
-    if (OutCount != ParentCount) {
+    Counter LoopCount =
+        addCounters(ParentCount, BackedgeCount, BC.ContinueCount);
+    auto BranchCount = getBranchCounterPair(S, LoopCount, getRegionCounter(S));
+    assert(BranchCount.Executed.isZero() || BranchCount.Executed == BodyCount ||
+           llvm::EnableSingleByteCoverage);
+    assert(
+        !llvm::EnableSingleByteCoverage ||
+        (BC.BreakCount.isZero() && BranchCount.Skipped == getRegionCounter(S)));
+
+    Counter OutCount = addCounters(BC.BreakCount, BranchCount.Skipped);
+    if (!IsCounterEqual(OutCount, ParentCount)) {
       pushRegion(OutCount);
       GapRegionCounter = OutCount;
       if (BodyHasTerminateStmt)
@@ -1777,8 +1822,7 @@ struct CounterCoverageMappingBuilder
 
     // Create Branch Region around condition.
     if (!llvm::EnableSingleByteCoverage)
-      createBranchRegion(S->getCond(), BodyCount,
-                         subtractCounters(LoopCount, BodyCount));
+      createBranchRegion(S->getCond(), BodyCount, BranchCount.Skipped);
   }
 
   void VisitObjCForCollectionStmt(const ObjCForCollectionStmt *S) {
@@ -1800,9 +1844,10 @@ struct CounterCoverageMappingBuilder
 
     Counter LoopCount =
         addCounters(ParentCount, BackedgeCount, BC.ContinueCount);
-    Counter OutCount =
-        addCounters(BC.BreakCount, subtractCounters(LoopCount, BodyCount));
-    if (OutCount != ParentCount) {
+    auto BranchCount = getBranchCounterPair(S, LoopCount);
+    assert(BranchCount.Executed.isZero() || BranchCount.Executed == BodyCount);
+    Counter OutCount = addCounters(BC.BreakCount, BranchCount.Skipped);
+    if (!IsCounterEqual(OutCount, ParentCount)) {
       pushRegion(OutCount);
       GapRegionCounter = OutCount;
     }
@@ -2010,9 +2055,12 @@ struct CounterCoverageMappingBuilder
     extendRegion(S->getCond());
 
     Counter ParentCount = getRegion().getCounter();
-    Counter ThenCount = llvm::EnableSingleByteCoverage
-                            ? getRegionCounter(S->getThen())
-                            : getRegionCounter(S);
+    auto [ThenCount, ElseCount] =
+        (llvm::EnableSingleByteCoverage
+             ? BranchCounterPair{getRegionCounter(S->getThen()),
+                                 (S->getElse() ? getRegionCounter(S->getElse())
+                                               : Counter::getZero())}
+             : getBranchCounterPair(S, ParentCount));
 
     // Emitting a counter for the condition makes it easier to interpret the
     // counter for the body when looking at the coverage.
@@ -2027,12 +2075,6 @@ struct CounterCoverageMappingBuilder
     extendRegion(S->getThen());
     Counter OutCount = propagateCounts(ThenCount, S->getThen());
 
-    Counter ElseCount;
-    if (!llvm::EnableSingleByteCoverage)
-      ElseCount = subtractCounters(ParentCount, ThenCount);
-    else if (S->getElse())
-      ElseCount = getRegionCounter(S->getElse());
-
     if (const Stmt *Else = S->getElse()) {
       bool ThenHasTerminateStmt = HasTerminateStmt;
       HasTerminateStmt = false;
@@ -2055,15 +2097,14 @@ struct CounterCoverageMappingBuilder
     if (llvm::EnableSingleByteCoverage)
       OutCount = getRegionCounter(S);
 
-    if (OutCount != ParentCount) {
+    if (!IsCounterEqual(OutCount, ParentCount)) {
       pushRegion(OutCount);
       GapRegionCounter = OutCount;
     }
 
     if (!llvm::EnableSingleByteCoverage)
       // Create Branch Region around condition.
-      createBranchRegion(S->getCond(), ThenCount,
-                         subtractCounters(ParentCount, ThenCount));
+      createBranchRegion(S->getCond(), ThenCount, ElseCount);
   }
 
   void VisitCXXTryStmt(const CXXTryStmt *S) {
@@ -2089,9 +2130,11 @@ struct CounterCoverageMappingBuilder
     extendRegion(E);
 
     Counter ParentCount = getRegion().getCounter();
-    Counter TrueCount = llvm::EnableSingleByteCoverage
-                            ? getRegionCounter(E->getTrueExpr())
-                            : getRegionCounter(E);
+    auto [TrueCount, FalseCount] =
+        (llvm::EnableSingleByteCoverage
+             ? BranchCounterPair{getRegionCounter(E->getTrueExpr()),
+                                 getRegionCounter(E->getFalseExpr())}
+             : getBranchCounterPair(E, ParentCount));
     Counter OutCount;
 
     if (const auto *BCO = dyn_cast<BinaryConditionalOperator>(E)) {
@@ -2110,25 +2153,20 @@ struct CounterCoverageMappingBuilder
     }
 
     extendRegion(E->getFalseExpr());
-    Counter FalseCount = llvm::EnableSingleByteCoverage
-                             ? getRegionCounter(E->getFalseExpr())
-                             : subtractCounters(ParentCount, TrueCount);
-
     Counter FalseOutCount = propagateCounts(FalseCount, E->getFalseExpr());
     if (llvm::EnableSingleByteCoverage)
       OutCount = getRegionCounter(E);
     else
       OutCount = addCounters(OutCount, FalseOutCount);
 
-    if (OutCount != ParentCount) {
+    if (!IsCounterEqual(OutCount, ParentCount)) {
       pushRegion(OutCount);
       GapRegionCounter = OutCount;
     }
 
     // Create Branch Region around condition.
     if (!llvm::EnableSingleByteCoverage)
-      createBranchRegion(E->getCond(), TrueCount,
-                         subtractCounters(ParentCount, TrueCount));
+      createBranchRegion(E->getCond(), TrueCount, FalseCount);
   }
 
   void createOrCancelDecision(const BinaryOperator *E, unsigned Since) {
@@ -2227,27 +2265,27 @@ struct CounterCoverageMappingBuilder
     extendRegion(E->getRHS());
     propagateCounts(getRegionCounter(E), E->getRHS());
 
+    if (llvm::EnableSingleByteCoverage)
+      return;
+
     // Track RHS True/False Decision.
     const auto DecisionRHS = MCDCBuilder.back();
 
+    // Extract the Parent Region Counter.
+    Counter ParentCnt = getRegion().getCounter();
+
     // Extract the RHS's Execution Counter.
-    Counter RHSExecCnt = getRegionCounter(E);
+    auto [RHSExecCnt, LHSExitCnt] = getBranchCounterPair(E, ParentCnt);
 
     // Extract the RHS's "True" Instance Counter.
-    Counter RHSTrueCnt = getRegionCounter(E->getRHS());
-
-    // Extract the Parent Region Counter.
-    Counter ParentCnt = getRegion().getCounter();
+    auto [RHSTrueCnt, RHSExitCnt] =
+        getBranchCounterPair(E->getRHS(), RHSExecCnt);
 
     // Create Branch Region around LHS condition.
-    if (!llvm::EnableSingleByteCoverage)
-      createBranchRegion(E->getLHS(), RHSExecCnt,
-                         subtractCounters(ParentCnt, RHSExecCnt), DecisionLHS);
+    createBranchRegion(E->getLHS(), RHSExecCnt, LHSExitCnt, DecisionLHS);
 
     // Create Branch Region around RHS condition.
-    if (!llvm::EnableSingleByteCoverage)
-      createBranchRegion(E->getRHS(), RHSTrueCnt,
-                         subtractCounters(RHSExecCnt, RHSTrueCnt), DecisionRHS);
+    createBranchRegion(E->getRHS(), RHSTrueCnt, RHSExitCnt, DecisionRHS);
 
     // Create MCDC Decision Region if at top-level (root).
     if (IsRootNode)
@@ -2288,31 +2326,31 @@ struct CounterCoverageMappingBuilder
     extendRegion(E->getRHS());
     propagateCounts(getRegionCounter(E), E->getRHS());
 
+    if (llvm::EnableSingleByteCoverage)
+      return;
+
     // Track RHS True/False Decision.
     const auto DecisionRHS = MCDCBuilder.back();
 
+    // Extract the Parent Region Counter.
+    Counter ParentCnt = getRegion().getCounter();
+
     // Extract the RHS's Execution Counter.
-    Counter RHSExecCnt = getRegionCounter(E);
+    auto [RHSExecCnt, LHSExitCnt] = getBranchCounterPair(E, ParentCnt);
 
     // Extract the RHS's "False" Instance Counter.
-    Counter RHSFalseCnt = getRegionCounter(E->getRHS());
+    auto [RHSFalseCnt, RHSExitCnt] =
+        getBranchCounterPair(E->getRHS(), RHSExecCnt);
 
     if (!shouldVisitRHS(E->getLHS())) {
       GapRegionCounter = OutCount;
     }
 
-    // Extract the Parent Region Counter.
-    Counter ParentCnt = getRegion().getCounter();
-
     // Create Branch Region around LHS condition.
-    if (!llvm::EnableSingleByteCoverage)
-      createBranchRegion(E->getLHS(), subtractCounters(ParentCnt, RHSExecCnt),
-                         RHSExecCnt, DecisionLHS);
+    createBranchRegion(E->getLHS(), LHSExitCnt, RHSExecCnt, DecisionLHS);
 
     // Create Branch Region around RHS condition.
-    if (!llvm::EnableSingleByteCoverage)
-      createBranchRegion(E->getRHS(), subtractCounters(RHSExecCnt, RHSFalseCnt),
-                         RHSFalseCnt, DecisionRHS);
+    createBranchRegion(E->getRHS(), RHSExitCnt, RHSFalseCnt, DecisionRHS);
 
     // Create MCDC Decision Region if at top-level (root).
     if (IsRootNode)
diff --git a/clang/lib/CodeGen/CoverageMappingGen.h b/clang/lib/CodeGen/CoverageMappingGen.h
index fe4b93f3af85..0ed50597e1dc 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.h
+++ b/clang/lib/CodeGen/CoverageMappingGen.h
@@ -95,6 +95,7 @@ public:
 namespace CodeGen {
 
 class CodeGenModule;
+class CounterPair;
 
 namespace MCDC {
 struct State;
@@ -158,7 +159,7 @@ class CoverageMappingGen {
   CoverageMappingModuleGen &CVM;
   SourceManager &SM;
   const LangOptions &LangOpts;
-  llvm::DenseMap<const Stmt *, unsigned> *CounterMap;
+  llvm::DenseMap<const Stmt *, CounterPair> *CounterMap;
   MCDC::State *MCDCState;
 
 public:
@@ -169,7 +170,7 @@ public:
 
   CoverageMappingGen(CoverageMappingModuleGen &CVM, SourceManager &SM,
                      const LangOptions &LangOpts,
-                     llvm::DenseMap<const Stmt *, unsigned> *CounterMap,
+                     llvm::DenseMap<const Stmt *, CounterPair> *CounterMap,
                      MCDC::State *MCDCState)
       : CVM(CVM), SM(SM), LangOpts(LangOpts), CounterMap(CounterMap),
         MCDCState(MCDCState) {}
diff --git a/clang/lib/CodeGen/SanitizerMetadata.cpp b/clang/lib/CodeGen/SanitizerMetadata.cpp
index 61fdf3399ff3..b7b212ba46ef 100644
--- a/clang/lib/CodeGen/SanitizerMetadata.cpp
+++ b/clang/lib/CodeGen/SanitizerMetadata.cpp
@@ -145,7 +145,9 @@ void SanitizerMetadata::reportGlobal(llvm::GlobalVariable *GV, const VarDecl &D,
     for (auto *Attr : D.specific_attrs<NoSanitizeAttr>())
       NoSanitizeMask |= Attr->getMask();
 
-    if (D.hasExternalStorage())
+    // External definitions and incomplete types get handled at the place they
+    // are defined.
+    if (D.hasExternalStorage() || D.getType()->isIncompleteType())
       NoSanitizeMask |= SanitizerKind::Type;
 
     return NoSanitizeMask;
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index 0431d2cc4ddc..b82e4ddb9f3f 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -9,6 +9,7 @@
 #include "ABIInfoImpl.h"
 #include "TargetInfo.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/CallingConv.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
 
 using namespace clang;
@@ -79,13 +80,11 @@ public:
   // Adds a NamedMDNode with GV, Name, and Operand as operands, and adds the
   // resulting MDNode to the nvvm.annotations MDNode.
   static void addNVVMMetadata(llvm::GlobalValue *GV, StringRef Name,
-                              int Operand,
-                              const SmallVectorImpl<int> &GridConstantArgs);
+                              int Operand);
 
-  static void addNVVMMetadata(llvm::GlobalValue *GV, StringRef Name,
-                              int Operand) {
-    addNVVMMetadata(GV, Name, Operand, SmallVector<int, 1>(0));
-  }
+  static void
+  addGridConstantNVVMMetadata(llvm::GlobalValue *GV,
+                              const SmallVectorImpl<int> &GridConstantArgs);
 
 private:
   static void emitBuiltinSurfTexDeviceCopy(CodeGenFunction &CGF, LValue Dst,
@@ -259,7 +258,7 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
     if (FD->hasAttr<OpenCLKernelAttr>()) {
       // OpenCL __kernel functions get kernel metadata
       // Create !{<func-ref>, metadata !"kernel", i32 1} node
-      addNVVMMetadata(F, "kernel", 1);
+      F->setCallingConv(llvm::CallingConv::PTX_Kernel);
       // And kernel functions are not subject to inlining
       F->addFnAttr(llvm::Attribute::NoInline);
     }
@@ -277,7 +276,8 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
           // For some reason arg indices are 1-based in NVVM
           GCI.push_back(IV.index() + 1);
       // Create !{<func-ref>, metadata !"kernel", i32 1} node
-      addNVVMMetadata(F, "kernel", 1, GCI);
+      F->setCallingConv(llvm::CallingConv::PTX_Kernel);
+      addGridConstantNVVMMetadata(F, GCI);
     }
     if (CUDALaunchBoundsAttr *Attr = FD->getAttr<CUDALaunchBoundsAttr>())
       M.handleCUDALaunchBoundsAttr(F, Attr);
@@ -285,13 +285,12 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
 
   // Attach kernel metadata directly if compiling for NVPTX.
   if (FD->hasAttr<NVPTXKernelAttr>()) {
-    addNVVMMetadata(F, "kernel", 1);
+    F->setCallingConv(llvm::CallingConv::PTX_Kernel);
   }
 }
 
-void NVPTXTargetCodeGenInfo::addNVVMMetadata(
-    llvm::GlobalValue *GV, StringRef Name, int Operand,
-    const SmallVectorImpl<int> &GridConstantArgs) {
+void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::GlobalValue *GV,
+                                             StringRef Name, int Operand) {
   llvm::Module *M = GV->getParent();
   llvm::LLVMContext &Ctx = M->getContext();
 
@@ -302,6 +301,21 @@ void NVPTXTargetCodeGenInfo::addNVVMMetadata(
       llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, Name),
       llvm::ConstantAsMetadata::get(
           llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand))};
+
+  // Append metadata to nvvm.annotations
+  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
+}
+
+void NVPTXTargetCodeGenInfo::addGridConstantNVVMMetadata(
+    llvm::GlobalValue *GV, const SmallVectorImpl<int> &GridConstantArgs) {
+
+  llvm::Module *M = GV->getParent();
+  llvm::LLVMContext &Ctx = M->getContext();
+
+  // Get "nvvm.annotations" metadata node
+  llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
+
+  SmallVector<llvm::Metadata *, 5> MDVals = {llvm::ConstantAsMetadata::get(GV)};
   if (!GridConstantArgs.empty()) {
     SmallVector<llvm::Metadata *, 10> GCM;
     for (int I : GridConstantArgs)
@@ -310,6 +324,7 @@ void NVPTXTargetCodeGenInfo::addNVVMMetadata(
     MDVals.append({llvm::MDString::get(Ctx, "grid_constant"),
                    llvm::MDNode::get(Ctx, GCM)});
   }
+
   // Append metadata to nvvm.annotations
   MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
 }
diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp
index a48fe9d5f1ee..5c75e985e953 100644
--- a/clang/lib/CodeGen/Targets/SPIR.cpp
+++ b/clang/lib/CodeGen/Targets/SPIR.cpp
@@ -64,6 +64,8 @@ public:
   void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
   LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                   const VarDecl *D) const override;
+  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
+                           CodeGen::CodeGenModule &M) const override;
   llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                          SyncScope Scope,
                                          llvm::AtomicOrdering Ordering,
@@ -245,6 +247,41 @@ SPIRVTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
   return DefaultGlobalAS;
 }
 
+void SPIRVTargetCodeGenInfo::setTargetAttributes(
+    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
+  if (!M.getLangOpts().HIP ||
+      M.getTarget().getTriple().getVendor() != llvm::Triple::AMD)
+    return;
+  if (GV->isDeclaration())
+    return;
+
+  auto F = dyn_cast<llvm::Function>(GV);
+  if (!F)
+    return;
+
+  auto FD = dyn_cast_or_null<FunctionDecl>(D);
+  if (!FD)
+    return;
+  if (!FD->hasAttr<CUDAGlobalAttr>())
+    return;
+
+  unsigned N = M.getLangOpts().GPUMaxThreadsPerBlock;
+  if (auto FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>())
+    N = FlatWGS->getMax()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+
+  // We encode the maximum flat WG size in the first component of the 3D
+  // max_work_group_size attribute, which will get reverse translated into the
+  // original AMDGPU attribute when targeting AMDGPU.
+  auto Int32Ty = llvm::IntegerType::getInt32Ty(M.getLLVMContext());
+  llvm::Metadata *AttrMDArgs[] = {
+      llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, N)),
+      llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, 1)),
+      llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, 1))};
+
+  F->setMetadata("max_work_group_size",
+                 llvm::MDNode::get(M.getLLVMContext(), AttrMDArgs));
+}
+
 llvm::SyncScope::ID
 SPIRVTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &, SyncScope Scope,
                                            llvm::AtomicOrdering,
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 36d6c93c4332..528b7d1a9c7b 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -1063,6 +1063,34 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
   //
 }
 
+bool Driver::loadZOSCustomizationFile(llvm::cl::ExpansionContext &ExpCtx) {
+  if (IsCLMode() || IsDXCMode() || IsFlangMode())
+    return false;
+
+  SmallString<128> CustomizationFile;
+  StringRef PathLIBEnv = StringRef(getenv("CLANG_CONFIG_PATH")).trim();
+  // If the env var is a directory then append "/clang.cfg" and treat
+  // that as the config file.  Otherwise treat the env var as the
+  // config file.
+  if (!PathLIBEnv.empty()) {
+    llvm::sys::path::append(CustomizationFile, PathLIBEnv);
+    if (llvm::sys::fs::is_directory(PathLIBEnv))
+      llvm::sys::path::append(CustomizationFile, "/clang.cfg");
+    if (llvm::sys::fs::is_regular_file(CustomizationFile))
+      return readConfigFile(CustomizationFile, ExpCtx);
+    Diag(diag::err_drv_config_file_not_found) << CustomizationFile;
+    return true;
+  }
+
+  SmallString<128> BaseDir(llvm::sys::path::parent_path(Dir));
+  llvm::sys::path::append(CustomizationFile, BaseDir + "/etc/clang.cfg");
+  if (llvm::sys::fs::is_regular_file(CustomizationFile))
+    return readConfigFile(CustomizationFile, ExpCtx);
+
+  // If no customization file, just return
+  return false;
+}
+
 static void appendOneArg(InputArgList &Args, const Arg *Opt) {
   // The args for config files or /clang: flags belong to different InputArgList
   // objects than Args. This copies an Arg from one of those other InputArgLists
@@ -1284,11 +1312,18 @@ bool Driver::loadDefaultConfigFiles(llvm::cl::ExpansionContext &ExpCtx) {
   }
 
   // Otherwise, use the real triple as used by the driver.
+  llvm::Triple RealTriple =
+      computeTargetTriple(*this, TargetTriple, *CLOptions);
   if (Triple.str().empty()) {
-    Triple = computeTargetTriple(*this, TargetTriple, *CLOptions);
+    Triple = RealTriple;
     assert(!Triple.str().empty());
   }
 
+  // On z/OS, start by loading the customization file before loading
+  // the usual default config file(s).
+  if (RealTriple.isOSzOS() && loadZOSCustomizationFile(ExpCtx))
+    return true;
+
   // Search for config files in the following order:
   // 1. <triple>-<mode>.cfg using real driver mode
   //    (e.g. i386-pc-linux-gnu-clang++.cfg).
@@ -6686,6 +6721,8 @@ const ToolChain &Driver::getToolChain(const ArgList &Args,
           TC = std::make_unique<toolchains::BareMetal>(*this, Target, Args);
         else if (Target.isOSBinFormatELF())
           TC = std::make_unique<toolchains::Generic_ELF>(*this, Target, Args);
+        else if (Target.isAppleMachO())
+          TC = std::make_unique<toolchains::AppleMachO>(*this, Target, Args);
         else if (Target.isOSBinFormatMachO())
           TC = std::make_unique<toolchains::MachO>(*this, Target, Args);
         else
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index a0002371da2f..c4b5374d3fff 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5109,6 +5109,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   const llvm::Triple *AuxTriple =
       (IsCuda || IsHIP) ? TC.getAuxTriple() : nullptr;
   bool IsWindowsMSVC = RawTriple.isWindowsMSVCEnvironment();
+  bool IsUEFI = RawTriple.isUEFI();
   bool IsIAMCU = RawTriple.isOSIAMCU();
 
   // Adjust IsWindowsXYZ for CUDA/HIP/SYCL compilations.  Even when compiling in
@@ -7252,7 +7253,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
 
   // -fms-extensions=0 is default.
   if (Args.hasFlag(options::OPT_fms_extensions, options::OPT_fno_ms_extensions,
-                   IsWindowsMSVC))
+                   IsWindowsMSVC || IsUEFI))
     CmdArgs.push_back("-fms-extensions");
 
   // -fms-compatibility=0 is default.
@@ -9281,6 +9282,10 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
   if (const Arg *A = Args.getLastArg(options::OPT_Rpass_analysis_EQ))
     CmdArgs.push_back(Args.MakeArgString(
         Twine("--offload-opt=-pass-remarks-analysis=") + A->getValue()));
+
+  if (Args.getLastArg(options::OPT_ftime_report))
+    CmdArgs.push_back("--device-compiler=-ftime-report");
+
   if (Args.getLastArg(options::OPT_save_temps_EQ))
     CmdArgs.push_back("--save-temps");
 
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 60214c4d59ce..f8967890f722 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1209,6 +1209,10 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args,
   if (ImplicitMapSyms)
     CmdArgs.push_back(
         Args.MakeArgString(Twine(PluginOptPrefix) + "-implicit-mapsyms"));
+
+  if (Args.hasArg(options::OPT_ftime_report))
+    CmdArgs.push_back(
+        Args.MakeArgString(Twine(PluginOptPrefix) + "-time-passes"));
 }
 
 void tools::addOpenMPRuntimeLibraryPath(const ToolChain &TC,
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index 56b6dd78673c..e5dffb11d1a5 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -966,11 +966,14 @@ MachO::MachO(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
   getProgramPaths().push_back(getDriver().Dir);
 }
 
+AppleMachO::AppleMachO(const Driver &D, const llvm::Triple &Triple,
+                       const ArgList &Args)
+    : MachO(D, Triple, Args), CudaInstallation(D, Triple, Args),
+      RocmInstallation(D, Triple, Args), SYCLInstallation(D, Triple, Args) {}
+
 /// Darwin - Darwin tool chain for i386 and x86_64.
 Darwin::Darwin(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
-    : MachO(D, Triple, Args), TargetInitialized(false),
-      CudaInstallation(D, Triple, Args), RocmInstallation(D, Triple, Args),
-      SYCLInstallation(D, Triple, Args) {}
+    : AppleMachO(D, Triple, Args), TargetInitialized(false) {}
 
 types::ID MachO::LookupTypeForExtension(StringRef Ext) const {
   types::ID Ty = ToolChain::LookupTypeForExtension(Ext);
@@ -1019,18 +1022,18 @@ bool Darwin::hasBlocksRuntime() const {
   }
 }
 
-void Darwin::AddCudaIncludeArgs(const ArgList &DriverArgs,
-                                ArgStringList &CC1Args) const {
+void AppleMachO::AddCudaIncludeArgs(const ArgList &DriverArgs,
+                                    ArgStringList &CC1Args) const {
   CudaInstallation->AddCudaIncludeArgs(DriverArgs, CC1Args);
 }
 
-void Darwin::AddHIPIncludeArgs(const ArgList &DriverArgs,
-                               ArgStringList &CC1Args) const {
+void AppleMachO::AddHIPIncludeArgs(const ArgList &DriverArgs,
+                                   ArgStringList &CC1Args) const {
   RocmInstallation->AddHIPIncludeArgs(DriverArgs, CC1Args);
 }
 
-void Darwin::addSYCLIncludeArgs(const ArgList &DriverArgs,
-                                ArgStringList &CC1Args) const {
+void AppleMachO::addSYCLIncludeArgs(const ArgList &DriverArgs,
+                                    ArgStringList &CC1Args) const {
   SYCLInstallation->addSYCLIncludeArgs(DriverArgs, CC1Args);
 }
 
@@ -1125,6 +1128,8 @@ VersionTuple MachO::getLinkerVersion(const llvm::opt::ArgList &Args) const {
 
 Darwin::~Darwin() {}
 
+AppleMachO::~AppleMachO() {}
+
 MachO::~MachO() {}
 
 std::string Darwin::ComputeEffectiveClangTriple(const ArgList &Args,
@@ -2488,7 +2493,7 @@ static void AppendPlatformPrefix(SmallString<128> &Path,
 // Returns the effective sysroot from either -isysroot or --sysroot, plus the
 // platform prefix (if any).
 llvm::SmallString<128>
-DarwinClang::GetEffectiveSysroot(const llvm::opt::ArgList &DriverArgs) const {
+AppleMachO::GetEffectiveSysroot(const llvm::opt::ArgList &DriverArgs) const {
   llvm::SmallString<128> Path("/");
   if (DriverArgs.hasArg(options::OPT_isysroot))
     Path = DriverArgs.getLastArgValue(options::OPT_isysroot);
@@ -2501,8 +2506,9 @@ DarwinClang::GetEffectiveSysroot(const llvm::opt::ArgList &DriverArgs) const {
   return Path;
 }
 
-void DarwinClang::AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
-                                            llvm::opt::ArgStringList &CC1Args) const {
+void AppleMachO::AddClangSystemIncludeArgs(
+    const llvm::opt::ArgList &DriverArgs,
+    llvm::opt::ArgStringList &CC1Args) const {
   const Driver &D = getDriver();
 
   llvm::SmallString<128> Sysroot = GetEffectiveSysroot(DriverArgs);
@@ -2580,7 +2586,7 @@ bool DarwinClang::AddGnuCPlusPlusIncludePaths(const llvm::opt::ArgList &DriverAr
   return getVFS().exists(Base);
 }
 
-void DarwinClang::AddClangCXXStdlibIncludeArgs(
+void AppleMachO::AddClangCXXStdlibIncludeArgs(
     const llvm::opt::ArgList &DriverArgs,
     llvm::opt::ArgStringList &CC1Args) const {
   // The implementation from a base class will pass through the -stdlib to
@@ -2637,55 +2643,60 @@ void DarwinClang::AddClangCXXStdlibIncludeArgs(
   }
 
   case ToolChain::CST_Libstdcxx:
-    llvm::SmallString<128> UsrIncludeCxx = Sysroot;
-    llvm::sys::path::append(UsrIncludeCxx, "usr", "include", "c++");
-
-    llvm::Triple::ArchType arch = getTriple().getArch();
-    bool IsBaseFound = true;
-    switch (arch) {
-    default: break;
-
-    case llvm::Triple::x86:
-    case llvm::Triple::x86_64:
-      IsBaseFound = AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx,
-                                                "4.2.1",
-                                                "i686-apple-darwin10",
-                                                arch == llvm::Triple::x86_64 ? "x86_64" : "");
-      IsBaseFound |= AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx,
-                                                "4.0.0", "i686-apple-darwin8",
-                                                 "");
-      break;
+    AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args);
+    break;
+  }
+}
 
-    case llvm::Triple::arm:
-    case llvm::Triple::thumb:
-      IsBaseFound = AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx,
-                                                "4.2.1",
-                                                "arm-apple-darwin10",
-                                                "v7");
-      IsBaseFound |= AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx,
-                                                "4.2.1",
-                                                "arm-apple-darwin10",
-                                                 "v6");
-      break;
+void AppleMachO::AddGnuCPlusPlusIncludePaths(
+    const llvm::opt::ArgList &DriverArgs,
+    llvm::opt::ArgStringList &CC1Args) const {}
 
-    case llvm::Triple::aarch64:
-      IsBaseFound = AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx,
-                                                "4.2.1",
-                                                "arm64-apple-darwin10",
-                                                "");
-      break;
-    }
+void DarwinClang::AddGnuCPlusPlusIncludePaths(
+    const llvm::opt::ArgList &DriverArgs,
+    llvm::opt::ArgStringList &CC1Args) const {
+  llvm::SmallString<128> UsrIncludeCxx = GetEffectiveSysroot(DriverArgs);
+  llvm::sys::path::append(UsrIncludeCxx, "usr", "include", "c++");
 
-    if (!IsBaseFound) {
-      getDriver().Diag(diag::warn_drv_libstdcxx_not_found);
-    }
+  llvm::Triple::ArchType arch = getTriple().getArch();
+  bool IsBaseFound = true;
+  switch (arch) {
+  default:
+    break;
 
+  case llvm::Triple::x86:
+  case llvm::Triple::x86_64:
+    IsBaseFound = AddGnuCPlusPlusIncludePaths(
+        DriverArgs, CC1Args, UsrIncludeCxx, "4.2.1", "i686-apple-darwin10",
+        arch == llvm::Triple::x86_64 ? "x86_64" : "");
+    IsBaseFound |= AddGnuCPlusPlusIncludePaths(
+        DriverArgs, CC1Args, UsrIncludeCxx, "4.0.0", "i686-apple-darwin8", "");
+    break;
+
+  case llvm::Triple::arm:
+  case llvm::Triple::thumb:
+    IsBaseFound =
+        AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx, "4.2.1",
+                                    "arm-apple-darwin10", "v7");
+    IsBaseFound |=
+        AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx, "4.2.1",
+                                    "arm-apple-darwin10", "v6");
+    break;
+
+  case llvm::Triple::aarch64:
+    IsBaseFound =
+        AddGnuCPlusPlusIncludePaths(DriverArgs, CC1Args, UsrIncludeCxx, "4.2.1",
+                                    "arm64-apple-darwin10", "");
     break;
   }
+
+  if (!IsBaseFound) {
+    getDriver().Diag(diag::warn_drv_libstdcxx_not_found);
+  }
 }
 
-void DarwinClang::AddCXXStdlibLibArgs(const ArgList &Args,
-                                      ArgStringList &CmdArgs) const {
+void AppleMachO::AddCXXStdlibLibArgs(const ArgList &Args,
+                                     ArgStringList &CmdArgs) const {
   CXXStdlibType Type = GetCXXStdlibType(Args);
 
   switch (Type) {
@@ -3621,7 +3632,7 @@ SanitizerMask Darwin::getSupportedSanitizers() const {
   return Res;
 }
 
-void Darwin::printVerboseInfo(raw_ostream &OS) const {
+void AppleMachO::printVerboseInfo(raw_ostream &OS) const {
   CudaInstallation->print(OS);
   RocmInstallation->print(OS);
 }
diff --git a/clang/lib/Driver/ToolChains/Darwin.h b/clang/lib/Driver/ToolChains/Darwin.h
index 5bc18581cfd2..c44780c577f4 100644
--- a/clang/lib/Driver/ToolChains/Darwin.h
+++ b/clang/lib/Driver/ToolChains/Darwin.h
@@ -291,8 +291,52 @@ public:
   /// }
 };
 
+/// Apple specific MachO extensions
+class LLVM_LIBRARY_VISIBILITY AppleMachO : public MachO {
+public:
+  AppleMachO(const Driver &D, const llvm::Triple &Triple,
+             const llvm::opt::ArgList &Args);
+  ~AppleMachO() override;
+
+  /// }
+  /// @name Apple Specific ToolChain Implementation
+  /// {
+  void
+  AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                            llvm::opt::ArgStringList &CC1Args) const override;
+
+  void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                          llvm::opt::ArgStringList &CC1Args) const override;
+  void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                         llvm::opt::ArgStringList &CC1Args) const override;
+  void addSYCLIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                          llvm::opt::ArgStringList &CC1Args) const override;
+
+  void AddClangCXXStdlibIncludeArgs(
+      const llvm::opt::ArgList &DriverArgs,
+      llvm::opt::ArgStringList &CC1Args) const override;
+  void AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args,
+                           llvm::opt::ArgStringList &CmdArgs) const override;
+
+  void printVerboseInfo(raw_ostream &OS) const override;
+  /// }
+
+  LazyDetector<CudaInstallationDetector> CudaInstallation;
+  LazyDetector<RocmInstallationDetector> RocmInstallation;
+  LazyDetector<SYCLInstallationDetector> SYCLInstallation;
+
+protected:
+  llvm::SmallString<128>
+  GetEffectiveSysroot(const llvm::opt::ArgList &DriverArgs) const;
+
+private:
+  virtual void
+  AddGnuCPlusPlusIncludePaths(const llvm::opt::ArgList &DriverArgs,
+                              llvm::opt::ArgStringList &CC1Args) const;
+};
+
 /// Darwin - The base Darwin tool chain.
-class LLVM_LIBRARY_VISIBILITY Darwin : public MachO {
+class LLVM_LIBRARY_VISIBILITY Darwin : public AppleMachO {
 public:
   /// Whether the information on the target has been initialized.
   //
@@ -330,10 +374,6 @@ public:
   /// The target variant triple that was specified (if any).
   mutable std::optional<llvm::Triple> TargetVariantTriple;
 
-  LazyDetector<CudaInstallationDetector> CudaInstallation;
-  LazyDetector<RocmInstallationDetector> RocmInstallation;
-  LazyDetector<SYCLInstallationDetector> SYCLInstallation;
-
 private:
   void AddDeploymentTarget(llvm::opt::DerivedArgList &Args) const;
 
@@ -345,7 +385,7 @@ public:
   std::string ComputeEffectiveClangTriple(const llvm::opt::ArgList &Args,
                                           types::ID InputType) const override;
 
-  /// @name Apple Specific Toolchain Implementation
+  /// @name Darwin Specific Toolchain Implementation
   /// {
 
   void addMinVersionArgs(const llvm::opt::ArgList &Args,
@@ -561,13 +601,6 @@ public:
   ObjCRuntime getDefaultObjCRuntime(bool isNonFragile) const override;
   bool hasBlocksRuntime() const override;
 
-  void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
-                          llvm::opt::ArgStringList &CC1Args) const override;
-  void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
-                         llvm::opt::ArgStringList &CC1Args) const override;
-  void addSYCLIncludeArgs(const llvm::opt::ArgList &DriverArgs,
-                          llvm::opt::ArgStringList &CC1Args) const override;
-
   bool UseObjCMixedDispatch() const override {
     // This is only used with the non-fragile ABI and non-legacy dispatch.
 
@@ -598,8 +631,6 @@ public:
   bool SupportsEmbeddedBitcode() const override;
 
   SanitizerMask getSupportedSanitizers() const override;
-
-  void printVerboseInfo(raw_ostream &OS) const override;
 };
 
 /// DarwinClang - The Darwin toolchain used by Clang.
@@ -617,16 +648,6 @@ public:
                              llvm::opt::ArgStringList &CmdArgs,
                              bool ForceLinkBuiltinRT = false) const override;
 
-  void AddClangCXXStdlibIncludeArgs(
-      const llvm::opt::ArgList &DriverArgs,
-      llvm::opt::ArgStringList &CC1Args) const override;
-
-  void AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
-                                 llvm::opt::ArgStringList &CC1Args) const override;
-
-  void AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args,
-                           llvm::opt::ArgStringList &CmdArgs) const override;
-
   void AddCCKextLibArgs(const llvm::opt::ArgList &Args,
                         llvm::opt::ArgStringList &CmdArgs) const override;
 
@@ -651,15 +672,16 @@ private:
                                StringRef Sanitizer,
                                bool shared = true) const;
 
+  void
+  AddGnuCPlusPlusIncludePaths(const llvm::opt::ArgList &DriverArgs,
+                              llvm::opt::ArgStringList &CC1Args) const override;
+
   bool AddGnuCPlusPlusIncludePaths(const llvm::opt::ArgList &DriverArgs,
                                    llvm::opt::ArgStringList &CC1Args,
                                    llvm::SmallString<128> Base,
                                    llvm::StringRef Version,
                                    llvm::StringRef ArchDir,
                                    llvm::StringRef BitDir) const;
-
-  llvm::SmallString<128>
-  GetEffectiveSysroot(const llvm::opt::ArgList &DriverArgs) const;
 };
 
 } // end namespace toolchains
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 7034e5b475c1..75b10e88371a 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -57,7 +57,8 @@ void Flang::addFortranDialectOptions(const ArgList &Args,
                             options::OPT_fno_automatic,
                             options::OPT_fhermetic_module_files,
                             options::OPT_frealloc_lhs,
-                            options::OPT_fno_realloc_lhs});
+                            options::OPT_fno_realloc_lhs,
+                            options::OPT_fsave_main_program});
 }
 
 void Flang::addPreprocessingOptions(const ArgList &Args,
diff --git a/clang/lib/Format/MatchFilePath.cpp b/clang/lib/Format/MatchFilePath.cpp
index 3d8614838e53..1f1e4bfac1e3 100644
--- a/clang/lib/Format/MatchFilePath.cpp
+++ b/clang/lib/Format/MatchFilePath.cpp
@@ -63,10 +63,10 @@ bool matchFilePath(StringRef Pattern, StringRef FilePath) {
       if (I == EOP) // `Pattern` ends with a star.
         return Globstar || NoMoreSeparatorsInFilePath;
       if (Pattern[I] != Separator) {
-        Globstar = false;
         // `Pattern` ends with a lone backslash.
         if (Pattern[I] == '\\' && ++I == EOP)
           return false;
+        Globstar = false;
       }
       // The star is followed by a (possibly escaped) `Separator`.
       if (Pattern[I] == Separator) {
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index e18e9a6fcd07..bf5ee281c431 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -137,8 +137,6 @@ public:
 private:
   ScopeType getScopeType(const FormatToken &Token) const {
     switch (Token.getType()) {
-    case TT_LambdaLBrace:
-      return ST_ChildBlock;
     case TT_ClassLBrace:
     case TT_StructLBrace:
     case TT_UnionLBrace:
@@ -3395,13 +3393,13 @@ private:
   /// Parse unary operator expressions and surround them with fake
   /// parentheses if appropriate.
   void parseUnaryOperator() {
-    llvm::SmallVector<FormatToken *, 2> Tokens;
+    SmallVector<FormatToken *, 2> Tokens;
     while (Current && Current->is(TT_UnaryOperator)) {
       Tokens.push_back(Current);
       next();
     }
     parse(PrecedenceArrowAndPeriod);
-    for (FormatToken *Token : llvm::reverse(Tokens)) {
+    for (FormatToken *Token : reverse(Tokens)) {
       // The actual precedence doesn't matter.
       addFakeParenthesis(Token, prec::Unknown);
     }
@@ -3579,7 +3577,7 @@ private:
 void TokenAnnotator::setCommentLineLevels(
     SmallVectorImpl<AnnotatedLine *> &Lines) const {
   const AnnotatedLine *NextNonCommentLine = nullptr;
-  for (AnnotatedLine *Line : llvm::reverse(Lines)) {
+  for (AnnotatedLine *Line : reverse(Lines)) {
     assert(Line->First);
 
     // If the comment is currently aligned with the line immediately following
@@ -3700,7 +3698,7 @@ void TokenAnnotator::annotate(AnnotatedLine &Line) {
   Line.Type = Parser.parseLine();
 
   if (!Line.Children.empty()) {
-    ScopeStack.push_back(ST_ChildBlock);
+    ScopeStack.push_back(ST_Other);
     const bool InRequiresExpression = Line.Type == LT_RequiresExpression;
     for (auto &Child : Line.Children) {
       if (InRequiresExpression &&
diff --git a/clang/lib/Format/TokenAnnotator.h b/clang/lib/Format/TokenAnnotator.h
index fa15517042f2..16e920e8ad8a 100644
--- a/clang/lib/Format/TokenAnnotator.h
+++ b/clang/lib/Format/TokenAnnotator.h
@@ -38,13 +38,11 @@ enum LineType {
 };
 
 enum ScopeType {
-  // Contained in child block.
-  ST_ChildBlock,
   // Contained in class declaration/definition.
   ST_Class,
   // Contained in compound requirement.
   ST_CompoundRequirement,
-  // Contained within other scope block (function, loop, if/else, etc).
+  // Contained in other blocks (function, lambda, loop, if/else, child, etc).
   ST_Other,
 };
 
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 6e47b374d4ed..d711df02ce95 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1274,9 +1274,7 @@ static void initOption(AnalyzerOptions::ConfigTable &Config,
     Diags->Report(diag::err_analyzer_config_invalid_input)
         << Name << "a positive";
 
-  auto Default = PositiveAnalyzerOption::create(DefaultVal);
-  assert(Default.has_value());
-  OptionField = Default.value();
+  OptionField = DefaultVal;
 }
 
 static void parseAnalyzerConfigs(AnalyzerOptions &AnOpts,
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 29723b573e77..8eba766f21a6 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1507,6 +1507,11 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
   // ELF targets define __ELF__
   if (TI.getTriple().isOSBinFormatELF())
     Builder.defineMacro("__ELF__");
+  else if (TI.getTriple().isAppleMachO())
+    // Apple MachO targets define __MACH__ even when not using DarwinTargetInfo.
+    // Hurd will also define this in some circumstances, but that's done in
+    // HurdTargetInfo. Windows targets don't define this.
+    Builder.defineMacro("__MACH__");
 
   // Target OS macro definitions.
   if (PPOpts.DefineTargetOSMacros) {
diff --git a/clang/lib/Lex/InitHeaderSearch.cpp b/clang/lib/Lex/InitHeaderSearch.cpp
index 67c9d92b849e..bb2a21356fa8 100644
--- a/clang/lib/Lex/InitHeaderSearch.cpp
+++ b/clang/lib/Lex/InitHeaderSearch.cpp
@@ -313,7 +313,7 @@ bool InitHeaderSearch::ShouldAddDefaultIncludePaths(
     break;
 
   case llvm::Triple::UnknownOS:
-    if (triple.isWasm())
+    if (triple.isWasm() || triple.isAppleMachO())
       return false;
     break;
 
diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp
index a9deae74cf27..c79ba97a2007 100644
--- a/clang/lib/Parse/ParseOpenACC.cpp
+++ b/clang/lib/Parse/ParseOpenACC.cpp
@@ -1003,7 +1003,9 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams(
       // the 'update' clause, so we have to handle it here.  U se an assert to
       // make sure we get the right differentiator.
       assert(DirKind == OpenACCDirectiveKind::Update);
-      [[fallthrough]];
+      ParsedClause.setVarListDetails(ParseOpenACCVarList(ClauseKind),
+                                     /*IsReadOnly=*/false, /*IsZero=*/false);
+      break;
     case OpenACCClauseKind::Device:
     case OpenACCClauseKind::DeviceResident:
     case OpenACCClauseKind::Host:
diff --git a/clang/lib/Sema/SemaAttr.cpp b/clang/lib/Sema/SemaAttr.cpp
index 44485e71d57a..42aa68d2905c 100644
--- a/clang/lib/Sema/SemaAttr.cpp
+++ b/clang/lib/Sema/SemaAttr.cpp
@@ -307,8 +307,8 @@ void Sema::inferLifetimeCaptureByAttribute(FunctionDecl *FD) {
       Annotate(MD);
     return;
   }
-  static const llvm::StringSet<> CapturingMethods{"insert", "push",
-                                                  "push_front", "push_back"};
+  static const llvm::StringSet<> CapturingMethods{
+      "insert", "insert_or_assign", "push", "push_front", "push_back"};
   if (!CapturingMethods.contains(MD->getName()))
     return;
   Annotate(MD);
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index 10f4920a761f..539de00bd104 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -846,7 +846,7 @@ bool Sema::CheckFunctionConstraints(const FunctionDecl *FD,
                                     bool ForOverloadResolution) {
   // Don't check constraints if the function is dependent. Also don't check if
   // this is a function template specialization, as the call to
-  // CheckFunctionTemplateConstraints after this will check it
+  // CheckinstantiatedFunctionTemplateConstraints after this will check it
   // better.
   if (FD->isDependentContext() ||
       FD->getTemplatedKind() ==
@@ -1111,55 +1111,12 @@ bool Sema::EnsureTemplateArgumentListConstraints(
   return false;
 }
 
-static bool CheckFunctionConstraintsWithoutInstantiation(
-    Sema &SemaRef, SourceLocation PointOfInstantiation,
-    FunctionTemplateDecl *Template, ArrayRef<TemplateArgument> TemplateArgs,
-    ConstraintSatisfaction &Satisfaction) {
-  SmallVector<const Expr *, 3> TemplateAC;
-  Template->getAssociatedConstraints(TemplateAC);
-  if (TemplateAC.empty()) {
-    Satisfaction.IsSatisfied = true;
-    return false;
-  }
-
-  LocalInstantiationScope Scope(SemaRef);
-
-  FunctionDecl *FD = Template->getTemplatedDecl();
-  // Collect the list of template arguments relative to the 'primary'
-  // template. We need the entire list, since the constraint is completely
-  // uninstantiated at this point.
-
-  // FIXME: Add TemplateArgs through the 'Innermost' parameter once
-  // the refactoring of getTemplateInstantiationArgs() relands.
-  MultiLevelTemplateArgumentList MLTAL;
-  MLTAL.addOuterTemplateArguments(Template, std::nullopt, /*Final=*/false);
-  SemaRef.getTemplateInstantiationArgs(
-      MLTAL, /*D=*/FD, FD,
-      /*Final=*/false, /*Innermost=*/std::nullopt, /*RelativeToPrimary=*/true,
-      /*Pattern=*/nullptr, /*ForConstraintInstantiation=*/true);
-  MLTAL.replaceInnermostTemplateArguments(Template, TemplateArgs);
-
-  Sema::ContextRAII SavedContext(SemaRef, FD);
-  std::optional<Sema::CXXThisScopeRAII> ThisScope;
-  if (auto *Method = dyn_cast<CXXMethodDecl>(FD))
-    ThisScope.emplace(SemaRef, /*Record=*/Method->getParent(),
-                      /*ThisQuals=*/Method->getMethodQualifiers());
-  return SemaRef.CheckConstraintSatisfaction(
-      Template, TemplateAC, MLTAL, PointOfInstantiation, Satisfaction);
-}
-
-bool Sema::CheckFunctionTemplateConstraints(
+bool Sema::CheckInstantiatedFunctionTemplateConstraints(
     SourceLocation PointOfInstantiation, FunctionDecl *Decl,
     ArrayRef<TemplateArgument> TemplateArgs,
     ConstraintSatisfaction &Satisfaction) {
   // In most cases we're not going to have constraints, so check for that first.
   FunctionTemplateDecl *Template = Decl->getPrimaryTemplate();
-
-  if (!Template)
-    return ::CheckFunctionConstraintsWithoutInstantiation(
-        *this, PointOfInstantiation, Decl->getDescribedFunctionTemplate(),
-        TemplateArgs, Satisfaction);
-
   // Note - code synthesis context for the constraints check is created
   // inside CheckConstraintsSatisfaction.
   SmallVector<const Expr *, 3> TemplateAC;
diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp
index 94f59bbc0aa3..ac5d51a1d2ff 100644
--- a/clang/lib/Sema/SemaExceptionSpec.cpp
+++ b/clang/lib/Sema/SemaExceptionSpec.cpp
@@ -1402,6 +1402,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) {
   case Stmt::OpenACCInitConstructClass:
   case Stmt::OpenACCShutdownConstructClass:
   case Stmt::OpenACCSetConstructClass:
+  case Stmt::OpenACCUpdateConstructClass:
     // These expressions can never throw.
     return CT_Cannot;
 
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 562c98c6babe..ae40895980d9 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -16592,6 +16592,13 @@ ExprResult Sema::BuildVAArgExpr(SourceLocation BuiltinLoc,
         << TInfo->getTypeLoc().getSourceRange();
     }
 
+    if (TInfo->getType()->isArrayType()) {
+      DiagRuntimeBehavior(TInfo->getTypeLoc().getBeginLoc(), E,
+                          PDiag(diag::warn_second_parameter_to_va_arg_array)
+                              << TInfo->getType()
+                              << TInfo->getTypeLoc().getSourceRange());
+    }
+
     // Check for va_arg where arguments of the given type will be promoted
     // (i.e. this va_arg is guaranteed to have undefined behavior).
     QualType PromoteType;
diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp
index 846b1966e765..51a95f99f062 100644
--- a/clang/lib/Sema/SemaOpenACC.cpp
+++ b/clang/lib/Sema/SemaOpenACC.cpp
@@ -498,12 +498,9 @@ bool checkAlreadyHasClauseOfKind(
 bool checkValidAfterDeviceType(
     SemaOpenACC &S, const OpenACCDeviceTypeClause &DeviceTypeClause,
     const SemaOpenACC::OpenACCParsedClause &NewClause) {
-  // This is only a requirement on compute, combined, data and loop constructs
-  // so far, so this is fine otherwise.
-  if (!isOpenACCComputeDirectiveKind(NewClause.getDirectiveKind()) &&
-      !isOpenACCCombinedDirectiveKind(NewClause.getDirectiveKind()) &&
-      NewClause.getDirectiveKind() != OpenACCDirectiveKind::Loop &&
-      NewClause.getDirectiveKind() != OpenACCDirectiveKind::Data)
+  // This is implemented for everything but 'routine', so treat as 'fine' for
+  // that.
+  if (NewClause.getDirectiveKind() == OpenACCDirectiveKind::Routine)
     return false;
 
   // OpenACC3.3: Section 2.4: Clauses that precede any device_type clause are
@@ -578,6 +575,21 @@ bool checkValidAfterDeviceType(
     default:
       break;
     }
+  } else if (NewClause.getDirectiveKind() == OpenACCDirectiveKind::Set ||
+             NewClause.getDirectiveKind() == OpenACCDirectiveKind::Init ||
+             NewClause.getDirectiveKind() == OpenACCDirectiveKind::Shutdown) {
+    // There are no restrictions on 'set', 'init', or 'shutdown'.
+    return false;
+  } else if (NewClause.getDirectiveKind() == OpenACCDirectiveKind::Update) {
+    // OpenACC3.3 section 2.14.4: Only the async and wait clauses may follow a
+    // device_type clause.
+    switch (NewClause.getClauseKind()) {
+    case OpenACCClauseKind::Async:
+    case OpenACCClauseKind::Wait:
+      return false;
+    default:
+      break;
+    }
   }
   S.Diag(NewClause.getBeginLoc(), diag::err_acc_clause_after_device_type)
       << NewClause.getClauseKind() << DeviceTypeClause.getClauseKind()
@@ -709,18 +721,11 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitTileClause(
 
 OpenACCClause *SemaOpenACCClauseVisitor::VisitIfClause(
     SemaOpenACC::OpenACCParsedClause &Clause) {
-  // Restrictions only properly implemented on 'compute'/'combined'/'data'
-  // constructs, and 'compute'/'combined'/'data' constructs are the only
-  // constructs that can do anything with this yet, so skip/treat as
-  // unimplemented in this case.
-  if (!isDirectiveKindImplemented(Clause.getDirectiveKind()))
-    return isNotImplemented();
-
   // There is no prose in the standard that says duplicates aren't allowed,
   // but this diagnostic is present in other compilers, as well as makes
   // sense. Prose DOES exist for 'data' and 'host_data', 'set', 'enter data' and
   // 'exit data' both don't, but other implmementations do this.  OpenACC issue
-  // 519 filed for the latter two.
+  // 519 filed for the latter two. Prose also exists for 'update'.
   // GCC allows this on init/shutdown, presumably for good reason, so we do too.
   if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Init &&
       Clause.getDirectiveKind() != OpenACCDirectiveKind::Shutdown &&
@@ -731,14 +736,14 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitIfClause(
   // isn't really much to do here.
 
   // If the 'if' clause is true, it makes the 'self' clause have no effect,
-  // diagnose that here.
-  // TODO OpenACC: When we add these two to other constructs, we might not
-  // want to warn on this (for example, 'update').
-  const auto *Itr =
-      llvm::find_if(ExistingClauses, llvm::IsaPred<OpenACCSelfClause>);
-  if (Itr != ExistingClauses.end()) {
-    SemaRef.Diag(Clause.getBeginLoc(), diag::warn_acc_if_self_conflict);
-    SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here);
+  // diagnose that here.  This only applies on compute/combined constructs.
+  if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Update) {
+    const auto *Itr =
+        llvm::find_if(ExistingClauses, llvm::IsaPred<OpenACCSelfClause>);
+    if (Itr != ExistingClauses.end()) {
+      SemaRef.Diag(Clause.getBeginLoc(), diag::warn_acc_if_self_conflict);
+      SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here);
+    }
   }
 
   return OpenACCIfClause::Create(Ctx, Clause.getBeginLoc(),
@@ -748,16 +753,6 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitIfClause(
 
 OpenACCClause *SemaOpenACCClauseVisitor::VisitSelfClause(
     SemaOpenACC::OpenACCParsedClause &Clause) {
-  // Restrictions only properly implemented on 'compute' constructs, and
-  // 'compute' constructs are the only construct that can do anything with
-  // this yet, so skip/treat as unimplemented in this case.
-  if (!isDirectiveKindImplemented(Clause.getDirectiveKind()))
-    return isNotImplemented();
-
-  // TODO OpenACC: When we implement this for 'update', this takes a
-  // 'var-list' instead of a condition expression, so semantics/handling has
-  // to happen differently here.
-
   // There is no prose in the standard that says duplicates aren't allowed,
   // but this diagnostic is present in other compilers, as well as makes
   // sense.
@@ -765,9 +760,12 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitSelfClause(
     return nullptr;
 
   // If the 'if' clause is true, it makes the 'self' clause have no effect,
-  // diagnose that here.
-  // TODO OpenACC: When we add these two to other constructs, we might not
-  // want to warn on this (for example, 'update').
+  // diagnose that here.  This only applies on compute/combined constructs.
+  if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Update)
+    return OpenACCSelfClause::Create(Ctx, Clause.getBeginLoc(),
+                                     Clause.getLParenLoc(), Clause.getVarList(),
+                                     Clause.getEndLoc());
+
   const auto *Itr =
       llvm::find_if(ExistingClauses, llvm::IsaPred<OpenACCIfClause>);
   if (Itr != ExistingClauses.end()) {
@@ -944,13 +942,6 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitVectorLengthClause(
 
 OpenACCClause *SemaOpenACCClauseVisitor::VisitAsyncClause(
     SemaOpenACC::OpenACCParsedClause &Clause) {
-  // Restrictions only properly implemented on 'compute'/'combined'/'data'
-  // constructs, and 'compute'/'combined'/'data' constructs are the only
-  // construct that can do anything with this yet, so skip/treat as
-  // unimplemented in this case.
-  if (!isDirectiveKindImplemented(Clause.getDirectiveKind()))
-    return isNotImplemented();
-
   // There is no prose in the standard that says duplicates aren't allowed,
   // but this diagnostic is present in other compilers, as well as makes
   // sense.
@@ -1185,13 +1176,6 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitDevicePtrClause(
 
 OpenACCClause *SemaOpenACCClauseVisitor::VisitWaitClause(
     SemaOpenACC::OpenACCParsedClause &Clause) {
-  // Restrictions only properly implemented on 'compute'/'combined'/'data'
-  // constructs, and 'compute'/'combined'/'data' constructs are the only
-  // construct that can do anything with this yet, so skip/treat as
-  // unimplemented in this case.
-  if (!isDirectiveKindImplemented(Clause.getDirectiveKind()))
-    return isNotImplemented();
-
   return OpenACCWaitClause::Create(
       Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getDevNumExpr(),
       Clause.getQueuesLoc(), Clause.getQueueIdExprs(), Clause.getEndLoc());
@@ -1199,11 +1183,8 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitWaitClause(
 
 OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceTypeClause(
     SemaOpenACC::OpenACCParsedClause &Clause) {
-  // Restrictions only properly implemented on 'compute', 'combined', 'data' and
-  // 'loop' constructs, and 'compute'/'combined'/'data'/'loop' constructs are
-  // the only construct that can do anything with this yet, so skip/treat as
-  // unimplemented in this case.
-  if (!isDirectiveKindImplemented(Clause.getDirectiveKind()))
+  // Restrictions implemented properly on everything except 'routine'.
+  if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Routine)
     return isNotImplemented();
 
   // OpenACC 3.3 2.14.3: Two instances of the same clause may not appear on the
@@ -1744,8 +1725,6 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitFinalizeClause(
 
 OpenACCClause *SemaOpenACCClauseVisitor::VisitIfPresentClause(
     SemaOpenACC::OpenACCParsedClause &Clause) {
-  if (!isDirectiveKindImplemented(Clause.getDirectiveKind()))
-    return isNotImplemented();
   // There isn't anything to do here, this is only valid on one construct, and
   // has no associated rules.
   return OpenACCIfPresentClause::Create(Ctx, Clause.getBeginLoc(),
@@ -1936,6 +1915,7 @@ bool PreserveLoopRAIIDepthInAssociatedStmtRAII(OpenACCDirectiveKind DK) {
   case OpenACCDirectiveKind::Init:
   case OpenACCDirectiveKind::Shutdown:
   case OpenACCDirectiveKind::Set:
+  case OpenACCDirectiveKind::Update:
     llvm_unreachable("Doesn't have an associated stmt");
   default:
   case OpenACCDirectiveKind::Invalid:
@@ -2365,6 +2345,7 @@ void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K,
   case OpenACCDirectiveKind::Init:
   case OpenACCDirectiveKind::Shutdown:
   case OpenACCDirectiveKind::Set:
+  case OpenACCDirectiveKind::Update:
     // Nothing to do here, there is no real legalization that needs to happen
     // here as these constructs do not take any arguments.
     break;
@@ -3713,6 +3694,9 @@ bool SemaOpenACC::ActOnStartStmtDirective(
                                 OpenACCClauseKind::DeviceType,
                                 OpenACCClauseKind::If});
 
+  // TODO: OpenACC: 'Update' construct needs to have one of 'self', 'host', or
+  // 'device'.  Implement here.
+
   return diagnoseConstructAppertainment(*this, K, StartLoc, /*IsStmt=*/true);
 }
 
@@ -3780,6 +3764,10 @@ StmtResult SemaOpenACC::ActOnEndStmtDirective(
     return OpenACCSetConstruct::Create(getASTContext(), StartLoc, DirLoc,
                                        EndLoc, Clauses);
   }
+  case OpenACCDirectiveKind::Update: {
+    return OpenACCUpdateConstruct::Create(getASTContext(), StartLoc, DirLoc,
+                                          EndLoc, Clauses);
+  }
   }
   llvm_unreachable("Unhandled case in directive handling?");
 }
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index d9149f7ee40b..25a07d0315ea 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -625,6 +625,15 @@ Sema::ActOnLabelStmt(SourceLocation IdentLoc, LabelDecl *TheDecl,
   if (getCurScope()->isInOpenACCComputeConstructScope())
     setFunctionHasBranchProtectedScope();
 
+  // OpenACC3.3 2.14.4:
+  // The update directive is executable.  It must not appear in place of the
+  // statement following an 'if', 'while', 'do', 'switch', or 'label' in C or
+  // C++.
+  if (isa<OpenACCUpdateConstruct>(SubStmt)) {
+    Diag(SubStmt->getBeginLoc(), diag::err_acc_update_as_body) << /*Label*/ 4;
+    SubStmt = new (Context) NullStmt(SubStmt->getBeginLoc());
+  }
+
   // Otherwise, things are good.  Fill in the declaration and return it.
   LabelStmt *LS = new (Context) LabelStmt(IdentLoc, TheDecl, SubStmt);
   TheDecl->setStmt(LS);
@@ -1019,6 +1028,15 @@ StmtResult Sema::ActOnIfStmt(SourceLocation IfLoc,
       Diags.Report(IfLoc, diag::warn_consteval_if_always_true) << Immediate;
   }
 
+  // OpenACC3.3 2.14.4:
+  // The update directive is executable.  It must not appear in place of the
+  // statement following an 'if', 'while', 'do', 'switch', or 'label' in C or
+  // C++.
+  if (isa<OpenACCUpdateConstruct>(thenStmt)) {
+    Diag(thenStmt->getBeginLoc(), diag::err_acc_update_as_body) << /*if*/ 0;
+    thenStmt = new (Context) NullStmt(thenStmt->getBeginLoc());
+  }
+
   return BuildIfStmt(IfLoc, StatementKind, LParenLoc, InitStmt, Cond, RParenLoc,
                      thenStmt, ElseLoc, elseStmt);
 }
@@ -1297,6 +1315,16 @@ Sema::ActOnFinishSwitchStmt(SourceLocation SwitchLoc, Stmt *Switch,
   getCurFunction()->SwitchStack.pop_back();
 
   if (!BodyStmt) return StmtError();
+
+  // OpenACC3.3 2.14.4:
+  // The update directive is executable.  It must not appear in place of the
+  // statement following an 'if', 'while', 'do', 'switch', or 'label' in C or
+  // C++.
+  if (isa<OpenACCUpdateConstruct>(BodyStmt)) {
+    Diag(BodyStmt->getBeginLoc(), diag::err_acc_update_as_body) << /*switch*/ 3;
+    BodyStmt = new (Context) NullStmt(BodyStmt->getBeginLoc());
+  }
+
   SS->setBody(BodyStmt, SwitchLoc);
 
   Expr *CondExpr = SS->getCond();
@@ -1774,6 +1802,15 @@ StmtResult Sema::ActOnWhileStmt(SourceLocation WhileLoc,
       !Diags.isIgnored(diag::warn_comma_operator, CondVal.second->getExprLoc()))
     CommaVisitor(*this).Visit(CondVal.second);
 
+  // OpenACC3.3 2.14.4:
+  // The update directive is executable.  It must not appear in place of the
+  // statement following an 'if', 'while', 'do', 'switch', or 'label' in C or
+  // C++.
+  if (isa<OpenACCUpdateConstruct>(Body)) {
+    Diag(Body->getBeginLoc(), diag::err_acc_update_as_body) << /*while*/ 1;
+    Body = new (Context) NullStmt(Body->getBeginLoc());
+  }
+
   if (isa<NullStmt>(Body))
     getCurCompoundScope().setHasEmptyLoopBodies();
 
@@ -1803,6 +1840,15 @@ Sema::ActOnDoStmt(SourceLocation DoLoc, Stmt *Body,
       !Diags.isIgnored(diag::warn_comma_operator, Cond->getExprLoc()))
     CommaVisitor(*this).Visit(Cond);
 
+  // OpenACC3.3 2.14.4:
+  // The update directive is executable.  It must not appear in place of the
+  // statement following an 'if', 'while', 'do', 'switch', or 'label' in C or
+  // C++.
+  if (isa<OpenACCUpdateConstruct>(Body)) {
+    Diag(Body->getBeginLoc(), diag::err_acc_update_as_body) << /*do*/ 2;
+    Body = new (Context) NullStmt(Body->getBeginLoc());
+  }
+
   return new (Context) DoStmt(Body, Cond, DoLoc, WhileLoc, CondRParen);
 }
 
diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp
index 422d8abc1028..106e2430de90 100644
--- a/clang/lib/Sema/SemaStmtAttr.cpp
+++ b/clang/lib/Sema/SemaStmtAttr.cpp
@@ -619,12 +619,6 @@ static Attr *handleHLSLLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A,
   return ::new (S.Context) HLSLLoopHintAttr(S.Context, A, UnrollFactor);
 }
 
-static Attr *handleHLSLControlFlowHint(Sema &S, Stmt *St, const ParsedAttr &A,
-                                       SourceRange Range) {
-
-  return ::new (S.Context) HLSLControlFlowHintAttr(S.Context, A);
-}
-
 static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A,
                                   SourceRange Range) {
   if (A.isInvalid() || A.getKind() == ParsedAttr::IgnoredAttribute)
@@ -661,8 +655,6 @@ static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A,
     return handleLoopHintAttr(S, St, A, Range);
   case ParsedAttr::AT_HLSLLoopHint:
     return handleHLSLLoopHintAttr(S, St, A, Range);
-  case ParsedAttr::AT_HLSLControlFlowHint:
-    return handleHLSLControlFlowHint(S, St, A, Range);
   case ParsedAttr::AT_OpenCLUnrollHint:
     return handleOpenCLUnrollHint(S, St, A, Range);
   case ParsedAttr::AT_Suppress:
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 20ec2fbeaa6a..ce672b00893b 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -4557,6 +4557,9 @@ Sema::CheckConceptTemplateId(const CXXScopeSpec &SS,
                              const TemplateArgumentListInfo *TemplateArgs) {
   assert(NamedConcept && "A concept template id without a template?");
 
+  if (NamedConcept->isInvalidDecl())
+    return ExprError();
+
   llvm::SmallVector<TemplateArgument, 4> SugaredConverted, CanonicalConverted;
   if (CheckTemplateArgumentList(
           NamedConcept, ConceptNameInfo.getLoc(),
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index acd1151184e4..1c1f6e30ab7b 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -3936,6 +3936,18 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
       Result != TemplateDeductionResult::Success)
     return Result;
 
+  // C++ [temp.deduct.call]p10: [DR1391]
+  //   If deduction succeeds for all parameters that contain
+  //   template-parameters that participate in template argument deduction,
+  //   and all template arguments are explicitly specified, deduced, or
+  //   obtained from default template arguments, remaining parameters are then
+  //   compared with the corresponding arguments. For each remaining parameter
+  //   P with a type that was non-dependent before substitution of any
+  //   explicitly-specified template arguments, if the corresponding argument
+  //   A cannot be implicitly converted to P, deduction fails.
+  if (CheckNonDependent())
+    return TemplateDeductionResult::NonDependentConversionFailure;
+
   // Form the template argument list from the deduced template arguments.
   TemplateArgumentList *SugaredDeducedArgumentList =
       TemplateArgumentList::CreateCopy(Context, SugaredBuilder);
@@ -3965,39 +3977,6 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
     FD = const_cast<FunctionDecl *>(FDFriend);
     Owner = FD->getLexicalDeclContext();
   }
-  // C++20 [temp.deduct.general]p5: [CWG2369]
-  //   If the function template has associated constraints, those constraints
-  //   are checked for satisfaction. If the constraints are not satisfied, type
-  //   deduction fails.
-  //
-  // FIXME: We haven't implemented CWG2369 for lambdas yet, because we need
-  // to figure out how to instantiate lambda captures to the scope without
-  // first instantiating the lambda.
-  bool IsLambda = isLambdaCallOperator(FD) || isLambdaConversionOperator(FD);
-  if (!IsLambda && !IsIncomplete) {
-    if (CheckFunctionTemplateConstraints(
-            Info.getLocation(),
-            FunctionTemplate->getCanonicalDecl()->getTemplatedDecl(),
-            CanonicalBuilder, Info.AssociatedConstraintsSatisfaction))
-      return TemplateDeductionResult::MiscellaneousDeductionFailure;
-    if (!Info.AssociatedConstraintsSatisfaction.IsSatisfied) {
-      Info.reset(Info.takeSugared(),
-                 TemplateArgumentList::CreateCopy(Context, CanonicalBuilder));
-      return TemplateDeductionResult::ConstraintsNotSatisfied;
-    }
-  }
-  // C++ [temp.deduct.call]p10: [CWG1391]
-  //   If deduction succeeds for all parameters that contain
-  //   template-parameters that participate in template argument deduction,
-  //   and all template arguments are explicitly specified, deduced, or
-  //   obtained from default template arguments, remaining parameters are then
-  //   compared with the corresponding arguments. For each remaining parameter
-  //   P with a type that was non-dependent before substitution of any
-  //   explicitly-specified template arguments, if the corresponding argument
-  //   A cannot be implicitly converted to P, deduction fails.
-  if (CheckNonDependent())
-    return TemplateDeductionResult::NonDependentConversionFailure;
-
   MultiLevelTemplateArgumentList SubstArgs(
       FunctionTemplate, CanonicalDeducedArgumentList->asArray(),
       /*Final=*/false);
@@ -4032,8 +4011,8 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
   //   ([temp.constr.decl]), those constraints are checked for satisfaction
   //   ([temp.constr.constr]). If the constraints are not satisfied, type
   //   deduction fails.
-  if (IsLambda && !IsIncomplete) {
-    if (CheckFunctionTemplateConstraints(
+  if (!IsIncomplete) {
+    if (CheckInstantiatedFunctionTemplateConstraints(
             Info.getLocation(), Specialization, CanonicalBuilder,
             Info.AssociatedConstraintsSatisfaction))
       return TemplateDeductionResult::MiscellaneousDeductionFailure;
diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
index 5d6c11a75303..d42c3765aa53 100644
--- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
+++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
@@ -902,12 +902,10 @@ Expr *buildIsDeducibleConstraint(Sema &SemaRef,
       Context.getTrivialTypeSourceInfo(
           Context.getDeducedTemplateSpecializationType(
               TemplateName(AliasTemplate), /*DeducedType=*/QualType(),
-              /*IsDependent=*/true),
-          AliasTemplate->getLocation()), // template specialization type whose
-                                         // arguments will be deduced.
+              /*IsDependent=*/true)), // template specialization type whose
+                                      // arguments will be deduced.
       Context.getTrivialTypeSourceInfo(
-          ReturnType, AliasTemplate->getLocation()), // type from which template
-                                                     // arguments are deduced.
+          ReturnType), // type from which template arguments are deduced.
   };
   return TypeTraitExpr::Create(
       Context, Context.getLogicalOperationType(), AliasTemplate->getLocation(),
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index cab9ae79ce5c..fb0f38df62a7 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -475,21 +475,6 @@ MultiLevelTemplateArgumentList Sema::getTemplateInstantiationArgs(
   assert((ND || DC) && "Can't find arguments for a decl if one isn't provided");
   // Accumulate the set of template argument lists in this structure.
   MultiLevelTemplateArgumentList Result;
-  getTemplateInstantiationArgs(
-      Result, ND, DC, Final, Innermost, RelativeToPrimary, Pattern,
-      ForConstraintInstantiation, SkipForSpecialization,
-      ForDefaultArgumentSubstitution);
-  return Result;
-}
-
-void Sema::getTemplateInstantiationArgs(
-    MultiLevelTemplateArgumentList &Result, const NamedDecl *ND,
-    const DeclContext *DC, bool Final,
-    std::optional<ArrayRef<TemplateArgument>> Innermost, bool RelativeToPrimary,
-    const FunctionDecl *Pattern, bool ForConstraintInstantiation,
-    bool SkipForSpecialization, bool ForDefaultArgumentSubstitution) {
-  assert((ND || DC) && "Can't find arguments for a decl if one isn't provided");
-  // Accumulate the set of template argument lists in this structure.
 
   using namespace TemplateInstArgsHelpers;
   const Decl *CurDecl = ND;
@@ -550,12 +535,14 @@ void Sema::getTemplateInstantiationArgs(
     }
 
     if (R.IsDone)
-      return;
+      return Result;
     if (R.ClearRelativeToPrimary)
       RelativeToPrimary = false;
     assert(R.NextDecl);
     CurDecl = R.NextDecl;
   }
+
+  return Result;
 }
 
 bool Sema::CodeSynthesisContext::isInstantiationRecord() const {
@@ -1362,19 +1349,6 @@ namespace {
     // Whether an incomplete substituion should be treated as an error.
     bool BailOutOnIncomplete;
 
-  private:
-    bool isSubstitutingConstraints() const {
-      return llvm::any_of(SemaRef.CodeSynthesisContexts, [](auto &Context) {
-        return Context.Kind ==
-               Sema::CodeSynthesisContext::ConstraintSubstitution;
-      });
-    }
-
-    // CWG2770: Function parameters should be instantiated when they are
-    // needed by a satisfaction check of an atomic constraint or
-    // (recursively) by another function parameter.
-    bool maybeInstantiateFunctionParameterToScope(ParmVarDecl *OldParm);
-
   public:
     typedef TreeTransform<TemplateInstantiator> inherited;
 
@@ -1431,19 +1405,12 @@ namespace {
                                  ArrayRef<UnexpandedParameterPack> Unexpanded,
                                  bool &ShouldExpand, bool &RetainExpansion,
                                  std::optional<unsigned> &NumExpansions) {
-      if (SemaRef.CurrentInstantiationScope && isSubstitutingConstraints()) {
-        for (UnexpandedParameterPack ParmPack : Unexpanded) {
-          NamedDecl *VD = ParmPack.first.dyn_cast<NamedDecl *>();
-          if (!isa_and_present<ParmVarDecl>(VD))
-            continue;
-          if (maybeInstantiateFunctionParameterToScope(cast<ParmVarDecl>(VD)))
-            return true;
-        }
-      }
-
-      return getSema().CheckParameterPacksForExpansion(
-          EllipsisLoc, PatternRange, Unexpanded, TemplateArgs, ShouldExpand,
-          RetainExpansion, NumExpansions);
+      return getSema().CheckParameterPacksForExpansion(EllipsisLoc,
+                                                       PatternRange, Unexpanded,
+                                                       TemplateArgs,
+                                                       ShouldExpand,
+                                                       RetainExpansion,
+                                                       NumExpansions);
     }
 
     void ExpandingFunctionParameterPack(ParmVarDecl *Pack) {
@@ -1944,62 +1911,9 @@ Decl *TemplateInstantiator::TransformDecl(SourceLocation Loc, Decl *D) {
     // template parameter.
   }
 
-  if (SemaRef.CurrentInstantiationScope) {
-    if (isSubstitutingConstraints() && isa<ParmVarDecl>(D) &&
-        maybeInstantiateFunctionParameterToScope(cast<ParmVarDecl>(D)))
-      return nullptr;
-  }
-
   return SemaRef.FindInstantiatedDecl(Loc, cast<NamedDecl>(D), TemplateArgs);
 }
 
-bool TemplateInstantiator::maybeInstantiateFunctionParameterToScope(
-    ParmVarDecl *OldParm) {
-  if (SemaRef.CurrentInstantiationScope->findInstantiationUnsafe(OldParm))
-    return false;
-  // We're instantiating a function parameter whose associated function template
-  // has not been instantiated at this point for constraint evaluation, so make
-  // sure the instantiated parameters are owned by a function declaration such
-  // that they can be correctly 'captured' in tryCaptureVariable().
-  Sema::ContextRAII Context(SemaRef, OldParm->getDeclContext());
-
-  if (!OldParm->isParameterPack())
-    return !TransformFunctionTypeParam(OldParm, /*indexAdjustment=*/0,
-                                       /*NumExpansions=*/std::nullopt,
-                                       /*ExpectParameterPack=*/false);
-
-  SmallVector<UnexpandedParameterPack, 2> Unexpanded;
-
-  // Find the parameter packs that could be expanded.
-  TypeLoc TL = OldParm->getTypeSourceInfo()->getTypeLoc();
-  PackExpansionTypeLoc ExpansionTL = TL.castAs<PackExpansionTypeLoc>();
-  TypeLoc Pattern = ExpansionTL.getPatternLoc();
-  SemaRef.collectUnexpandedParameterPacks(Pattern, Unexpanded);
-  assert(!Unexpanded.empty() && "Pack expansion without parameter packs?");
-
-  bool ShouldExpand = false;
-  bool RetainExpansion = false;
-  std::optional<unsigned> OrigNumExpansions =
-      ExpansionTL.getTypePtr()->getNumExpansions();
-  std::optional<unsigned> NumExpansions = OrigNumExpansions;
-  if (TryExpandParameterPacks(ExpansionTL.getEllipsisLoc(),
-                              Pattern.getSourceRange(), Unexpanded,
-                              ShouldExpand, RetainExpansion, NumExpansions))
-    return true;
-
-  assert(ShouldExpand && !RetainExpansion &&
-         "Shouldn't preserve pack expansion when evaluating constraints");
-  ExpandingFunctionParameterPack(OldParm);
-  for (unsigned I = 0; I != *NumExpansions; ++I) {
-    Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), I);
-    if (!TransformFunctionTypeParam(OldParm, /*indexAdjustment=*/0,
-                                    /*NumExpansions=*/OrigNumExpansions,
-                                    /*ExpectParameterPack=*/false))
-      return true;
-  }
-  return false;
-}
-
 Decl *TemplateInstantiator::TransformDefinition(SourceLocation Loc, Decl *D) {
   Decl *Inst = getSema().SubstDecl(D, getSema().CurContext, TemplateArgs);
   if (!Inst)
@@ -4677,8 +4591,9 @@ static const Decl *getCanonicalParmVarDecl(const Decl *D) {
   return D;
 }
 
+
 llvm::PointerUnion<Decl *, LocalInstantiationScope::DeclArgumentPack *> *
-LocalInstantiationScope::findInstantiationUnsafe(const Decl *D) {
+LocalInstantiationScope::findInstantiationOf(const Decl *D) {
   D = getCanonicalParmVarDecl(D);
   for (LocalInstantiationScope *Current = this; Current;
        Current = Current->Outer) {
@@ -4703,14 +4618,6 @@ LocalInstantiationScope::findInstantiationUnsafe(const Decl *D) {
       break;
   }
 
-  return nullptr;
-}
-
-llvm::PointerUnion<Decl *, LocalInstantiationScope::DeclArgumentPack *> *
-LocalInstantiationScope::findInstantiationOf(const Decl *D) {
-  auto *Result = findInstantiationUnsafe(D);
-  if (Result)
-    return Result;
   // If we're performing a partial substitution during template argument
   // deduction, we may not have values for template parameters yet.
   if (isa<NonTypeTemplateParmDecl>(D) || isa<TemplateTypeParmDecl>(D) ||
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index c40ff8b0d201..d00ad5a35e82 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -713,7 +713,7 @@ public:
   /// variables vector are acceptable.
   ///
   /// LastParamTransformed, if non-null, will be set to the index of the last
-  /// parameter on which transformation was started. In the event of an error,
+  /// parameter on which transfromation was started. In the event of an error,
   /// this will contain the parameter which failed to instantiate.
   ///
   /// Return true on error.
@@ -4178,6 +4178,15 @@ public:
         SourceLocation{}, {}, SourceLocation{}, EndLoc, Clauses, {});
   }
 
+  StmtResult RebuildOpenACCUpdateConstruct(SourceLocation BeginLoc,
+                                           SourceLocation DirLoc,
+                                           SourceLocation EndLoc,
+                                           ArrayRef<OpenACCClause *> Clauses) {
+    return getSema().OpenACC().ActOnEndStmtDirective(
+        OpenACCDirectiveKind::Update, BeginLoc, DirLoc, SourceLocation{},
+        SourceLocation{}, {}, SourceLocation{}, EndLoc, Clauses, {});
+  }
+
   StmtResult RebuildOpenACCWaitConstruct(
       SourceLocation BeginLoc, SourceLocation DirLoc, SourceLocation LParenLoc,
       Expr *DevNumExpr, SourceLocation QueuesLoc, ArrayRef<Expr *> QueueIdExprs,
@@ -11638,22 +11647,48 @@ template <typename Derived>
 void OpenACCClauseTransform<Derived>::VisitSelfClause(
     const OpenACCSelfClause &C) {
 
-  if (C.hasConditionExpr()) {
-    Expr *Cond = const_cast<Expr *>(C.getConditionExpr());
-    Sema::ConditionResult Res =
-        Self.TransformCondition(Cond->getExprLoc(), /*Var=*/nullptr, Cond,
-                                Sema::ConditionKind::Boolean);
+  // If this is an 'update' 'self' clause, this is actually a var list instead.
+  if (ParsedClause.getDirectiveKind() == OpenACCDirectiveKind::Update) {
+    llvm::SmallVector<Expr *> InstantiatedVarList;
+    for (Expr *CurVar : C.getVarList()) {
+      ExprResult Res = Self.TransformExpr(CurVar);
 
-    if (Res.isInvalid() || !Res.get().second)
-      return;
+      if (!Res.isUsable())
+        continue;
 
-    ParsedClause.setConditionDetails(Res.get().second);
-  }
+      Res = Self.getSema().OpenACC().ActOnVar(ParsedClause.getClauseKind(),
+                                              Res.get());
 
-  NewClause = OpenACCSelfClause::Create(
-      Self.getSema().getASTContext(), ParsedClause.getBeginLoc(),
-      ParsedClause.getLParenLoc(), ParsedClause.getConditionExpr(),
-      ParsedClause.getEndLoc());
+      if (Res.isUsable())
+        InstantiatedVarList.push_back(Res.get());
+    }
+
+    ParsedClause.setVarListDetails(InstantiatedVarList,
+                                   /*IsReadOnly=*/false, /*IsZero=*/false);
+
+    NewClause = OpenACCSelfClause::Create(
+        Self.getSema().getASTContext(), ParsedClause.getBeginLoc(),
+        ParsedClause.getLParenLoc(), ParsedClause.getVarList(),
+        ParsedClause.getEndLoc());
+  } else {
+
+    if (C.hasConditionExpr()) {
+      Expr *Cond = const_cast<Expr *>(C.getConditionExpr());
+      Sema::ConditionResult Res =
+          Self.TransformCondition(Cond->getExprLoc(), /*Var=*/nullptr, Cond,
+                                  Sema::ConditionKind::Boolean);
+
+      if (Res.isInvalid() || !Res.get().second)
+        return;
+
+      ParsedClause.setConditionDetails(Res.get().second);
+    }
+
+    NewClause = OpenACCSelfClause::Create(
+        Self.getSema().getASTContext(), ParsedClause.getBeginLoc(),
+        ParsedClause.getLParenLoc(), ParsedClause.getConditionExpr(),
+        ParsedClause.getEndLoc());
+  }
 }
 
 template <typename Derived>
@@ -12472,6 +12507,23 @@ TreeTransform<Derived>::TransformOpenACCSetConstruct(OpenACCSetConstruct *C) {
 }
 
 template <typename Derived>
+StmtResult TreeTransform<Derived>::TransformOpenACCUpdateConstruct(
+    OpenACCUpdateConstruct *C) {
+  getSema().OpenACC().ActOnConstruct(C->getDirectiveKind(), C->getBeginLoc());
+
+  llvm::SmallVector<OpenACCClause *> TransformedClauses =
+      getDerived().TransformOpenACCClauseList(C->getDirectiveKind(),
+                                              C->clauses());
+  if (getSema().OpenACC().ActOnStartStmtDirective(
+          C->getDirectiveKind(), C->getBeginLoc(), TransformedClauses))
+    return StmtError();
+
+  return getDerived().RebuildOpenACCUpdateConstruct(
+      C->getBeginLoc(), C->getDirectiveLoc(), C->getEndLoc(),
+      TransformedClauses);
+}
+
+template <typename Derived>
 StmtResult
 TreeTransform<Derived>::TransformOpenACCWaitConstruct(OpenACCWaitConstruct *C) {
   getSema().OpenACC().ActOnConstruct(C->getDirectiveKind(), C->getBeginLoc());
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 0c82e540047f..0368990ca150 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -12387,9 +12387,18 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() {
   }
   case OpenACCClauseKind::Self: {
     SourceLocation LParenLoc = readSourceLocation();
-    Expr *CondExpr = readBool() ? readSubExpr() : nullptr;
-    return OpenACCSelfClause::Create(getContext(), BeginLoc, LParenLoc,
-                                     CondExpr, EndLoc);
+    bool isConditionExprClause = readBool();
+    if (isConditionExprClause) {
+      Expr *CondExpr = readBool() ? readSubExpr() : nullptr;
+      return OpenACCSelfClause::Create(getContext(), BeginLoc, LParenLoc,
+                                       CondExpr, EndLoc);
+    }
+    unsigned NumVars = readInt();
+    llvm::SmallVector<Expr *> VarList;
+    for (unsigned I = 0; I < NumVars; ++I)
+      VarList.push_back(readSubExpr());
+    return OpenACCSelfClause::Create(getContext(), BeginLoc, LParenLoc, VarList,
+                                     EndLoc);
   }
   case OpenACCClauseKind::NumGangs: {
     SourceLocation LParenLoc = readSourceLocation();
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 719bc0d06f5b..8c60e85c93d7 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -2663,7 +2663,8 @@ void ASTDeclReader::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
 
   D->setDeclaredWithTypename(Record.readInt());
 
-  if (D->hasTypeConstraint()) {
+  bool TypeConstraintInitialized = D->hasTypeConstraint() && Record.readBool();
+  if (TypeConstraintInitialized) {
     ConceptReference *CR = nullptr;
     if (Record.readBool())
       CR = Record.readConceptReference();
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index 32e20c150814..4766f34e9f3a 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -2880,6 +2880,11 @@ void ASTStmtReader::VisitOpenACCSetConstruct(OpenACCSetConstruct *S) {
   VisitOpenACCConstructStmt(S);
 }
 
+void ASTStmtReader::VisitOpenACCUpdateConstruct(OpenACCUpdateConstruct *S) {
+  VisitStmt(S);
+  VisitOpenACCConstructStmt(S);
+}
+
 void ASTStmtReader::VisitOpenACCHostDataConstruct(OpenACCHostDataConstruct *S) {
   VisitStmt(S);
   VisitOpenACCAssociatedStmtConstruct(S);
@@ -4417,6 +4422,11 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
       S = OpenACCSetConstruct::CreateEmpty(Context, NumClauses);
       break;
     }
+    case STMT_OPENACC_UPDATE_CONSTRUCT: {
+      unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
+      S = OpenACCUpdateConstruct::CreateEmpty(Context, NumClauses);
+      break;
+    }
     case EXPR_REQUIRES: {
       unsigned numLocalParameters = Record[ASTStmtReader::NumExprFields];
       unsigned numRequirement = Record[ASTStmtReader::NumExprFields + 1];
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 39f8b0fd5ba0..8d9396e28ed5 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -8321,9 +8321,16 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
   case OpenACCClauseKind::Self: {
     const auto *SC = cast<OpenACCSelfClause>(C);
     writeSourceLocation(SC->getLParenLoc());
-    writeBool(SC->hasConditionExpr());
-    if (SC->hasConditionExpr())
-      AddStmt(const_cast<Expr*>(SC->getConditionExpr()));
+    writeBool(SC->isConditionExprClause());
+    if (SC->isConditionExprClause()) {
+      writeBool(SC->hasConditionExpr());
+      if (SC->hasConditionExpr())
+        AddStmt(const_cast<Expr *>(SC->getConditionExpr()));
+    } else {
+      writeUInt32(SC->getVarList().size());
+      for (Expr *E : SC->getVarList())
+        AddStmt(E);
+    }
     return;
   }
   case OpenACCClauseKind::NumGangs: {
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index 75c1d9a6d438..f8ed155ca389 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -1951,7 +1951,8 @@ void ASTDeclWriter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
   Record.push_back(D->wasDeclaredWithTypename());
 
   const TypeConstraint *TC = D->getTypeConstraint();
-  assert((bool)TC == D->hasTypeConstraint());
+  if (D->hasTypeConstraint())
+    Record.push_back(/*TypeConstraintInitialized=*/TC != nullptr);
   if (TC) {
     auto *CR = TC->getConceptReference();
     Record.push_back(CR != nullptr);
@@ -1969,7 +1970,7 @@ void ASTDeclWriter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
   if (OwnsDefaultArg)
     Record.AddTemplateArgumentLoc(D->getDefaultArgument());
 
-  if (!TC && !OwnsDefaultArg &&
+  if (!D->hasTypeConstraint() && !OwnsDefaultArg &&
       D->getDeclContext() == D->getLexicalDeclContext() &&
       !D->isInvalidDecl() && !D->hasAttrs() &&
       !D->isTopLevelDeclInObjCContainer() && !D->isImplicit() &&
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index de0e7bf5f176..7eedf7da7d3f 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -2963,6 +2963,12 @@ void ASTStmtWriter::VisitOpenACCSetConstruct(OpenACCSetConstruct *S) {
   Code = serialization::STMT_OPENACC_SET_CONSTRUCT;
 }
 
+void ASTStmtWriter::VisitOpenACCUpdateConstruct(OpenACCUpdateConstruct *S) {
+  VisitStmt(S);
+  VisitOpenACCConstructStmt(S);
+  Code = serialization::STMT_OPENACC_UPDATE_CONSTRUCT;
+}
+
 void ASTStmtWriter::VisitOpenACCHostDataConstruct(OpenACCHostDataConstruct *S) {
   VisitStmt(S);
   VisitOpenACCAssociatedStmtConstruct(S);
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index 70e95c2c644c..ff8bdcea9a22 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -1833,6 +1833,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
     case Stmt::OpenACCInitConstructClass:
     case Stmt::OpenACCShutdownConstructClass:
     case Stmt::OpenACCSetConstructClass:
+    case Stmt::OpenACCUpdateConstructClass:
     case Stmt::OMPUnrollDirectiveClass:
     case Stmt::OMPMetaDirectiveClass:
     case Stmt::HLSLOutArgExprClass: {
diff --git a/clang/test/AST/HLSL/HLSLControlFlowHint.hlsl b/clang/test/AST/HLSL/HLSLControlFlowHint.hlsl
deleted file mode 100644
index a36779c05fbc..000000000000
--- a/clang/test/AST/HLSL/HLSLControlFlowHint.hlsl
+++ /dev/null
@@ -1,43 +0,0 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-compute -ast-dump %s | FileCheck %s
-
-// CHECK: FunctionDecl 0x{{[0-9A-Fa-f]+}} <{{.*}}> {{.*}} used branch 'int (int)'
-// CHECK: AttributedStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>
-// CHECK-NEXT: -HLSLControlFlowHintAttr 0x{{[0-9A-Fa-f]+}} <{{.*}}> branch
-export int branch(int X){
-    int resp;
-    [branch] if (X > 0) {
-        resp = -X;
-    } else {
-        resp = X * 2;
-    }
-
-    return resp;
-}
-
-// CHECK: FunctionDecl 0x{{[0-9A-Fa-f]+}} <{{.*}}> {{.*}} used flatten 'int (int)'
-// CHECK: AttributedStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>
-// CHECK-NEXT: -HLSLControlFlowHintAttr 0x{{[0-9A-Fa-f]+}} <{{.*}}> flatten
-export int flatten(int X){
-    int resp;
-    [flatten] if (X > 0) {
-        resp = -X;
-    } else {
-        resp = X * 2;
-    }
-
-    return resp;
-}
-
-// CHECK: FunctionDecl 0x{{[0-9A-Fa-f]+}} <{{.*}}> {{.*}} used no_attr 'int (int)'
-// CHECK-NOT: AttributedStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>
-// CHECK-NOT: -HLSLControlFlowHintAttr
-export int no_attr(int X){
-    int resp;
-    if (X > 0) {
-        resp = -X;
-    } else {
-        resp = X * 2;
-    }
-
-    return resp;
-}
diff --git a/clang/test/AST/ast-print-openacc-update-construct.cpp b/clang/test/AST/ast-print-openacc-update-construct.cpp
new file mode 100644
index 000000000000..a7f5b2a42285
--- /dev/null
+++ b/clang/test/AST/ast-print-openacc-update-construct.cpp
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 -fopenacc -ast-print %s -o - | FileCheck %s
+void uses(bool cond) {
+  int I;
+  int *iPtr;
+  int array[5];
+  // CHECK: #pragma acc update
+#pragma acc update
+
+// CHECK: #pragma acc update if_present
+#pragma acc update if_present
+// CHECK: #pragma acc update if(cond)
+#pragma acc update if(cond)
+
+// CHECK: #pragma acc update async
+#pragma acc update async
+// CHECK: #pragma acc update async(*iPtr)
+#pragma acc update async(*iPtr)
+// CHECK: #pragma acc update async(I)
+#pragma acc update async(I)
+
+// CHECK: #pragma acc update wait(*iPtr, I) async
+#pragma acc update wait(*iPtr, I) async
+
+// CHECK: #pragma acc update wait(queues: *iPtr, I) async(*iPtr)
+#pragma acc update wait(queues:*iPtr, I) async(*iPtr)
+
+// CHECK: #pragma acc update wait(devnum: I : *iPtr, I) async(I)
+#pragma acc update wait(devnum:I:*iPtr, I) async(I)
+
+// CHECK: #pragma acc update wait(devnum: I : queues: *iPtr, I) if(I == array[I]) async(I)
+#pragma acc update wait(devnum:I:queues:*iPtr, I) if(I == array[I]) async(I)
+
+// CHECK: #pragma acc update device_type(I) dtype(H)
+#pragma acc update device_type(I) dtype(H)
+
+// CHECK: #pragma acc update device_type(J) dtype(K)
+#pragma acc update device_type(J) dtype(K)
+
+// CHECK: #pragma acc update self(I, iPtr, array, array[1], array[1:2])
+#pragma acc update self(I, iPtr, array, array[1], array[1:2])
+}
diff --git a/clang/test/CXX/drs/cwg23xx.cpp b/clang/test/CXX/drs/cwg23xx.cpp
index d0ec48e3f86c..d144cf9e4e86 100644
--- a/clang/test/CXX/drs/cwg23xx.cpp
+++ b/clang/test/CXX/drs/cwg23xx.cpp
@@ -365,35 +365,6 @@ struct A {
 #endif
 } // namespace cwg2363
 
-namespace cwg2369 { // cwg2369: partial
-#if __cplusplus >= 202002L
-template <class T> struct Z {
-  typedef typename T::x xx;
-};
-
-template <class T>
-concept C = requires { typename T::A; };
-template <C T> typename Z<T>::xx f(void *, T); // #1
-template <class T> void f(int, T);             // #2
-
-struct A {
-} a;
-
-struct ZZ {
-  template <class T, class = typename Z<T>::xx> operator T *();
-  operator int();
-};
-
-void foo() {
-  ZZ zz;
-  f(1, a); // OK, deduction fails for #1 because there is no conversion from int
-           // to void*
-  f(zz, 42); // OK, deduction fails for #1 because C<int> is not satisfied
-}
-
-#endif
-} // namespace cwg2369
-
 namespace cwg2370 { // cwg2370: no
 namespace N {
 typedef int type;
diff --git a/clang/test/CXX/drs/cwg26xx.cpp b/clang/test/CXX/drs/cwg26xx.cpp
index 23d7635ff906..efc49b0b502a 100644
--- a/clang/test/CXX/drs/cwg26xx.cpp
+++ b/clang/test/CXX/drs/cwg26xx.cpp
@@ -319,7 +319,7 @@ void f(T) requires requires { []() { T::invalid; } (); };
 //   since-cxx20-note@-3 {{in instantiation of requirement here}}
 //   since-cxx20-note@-4 {{while substituting template arguments into constraint expression here}}
 //   since-cxx20-note@#cwg2672-f-0 {{while checking constraint satisfaction for template 'f<int>' required here}}
-//   since-cxx20-note@#cwg2672-f-0 {{while substituting deduced template arguments into function template 'f' [with T = int]}}
+//   since-cxx20-note@#cwg2672-f-0 {{in instantiation of function template specialization 'cwg2672::f<int>' requested here}}
 void f(...);
 
 template <class T>
diff --git a/clang/test/CXX/drs/cwg27xx.cpp b/clang/test/CXX/drs/cwg27xx.cpp
index 7caf36a9f23b..a87d26dfc9ac 100644
--- a/clang/test/CXX/drs/cwg27xx.cpp
+++ b/clang/test/CXX/drs/cwg27xx.cpp
@@ -174,26 +174,6 @@ static_assert(!__is_layout_compatible(StructWithAnonUnion, StructWithAnonUnion3)
 #endif
 } // namespace cwg2759
 
-namespace cwg2770 { // cwg2770: 20 open 2023-07-14
-#if __cplusplus >= 202002L
-template<typename T>
-struct B {
-  static_assert(sizeof(T) == 1);
-  using type = int;
-};
-
-template<typename T>
-int f(T t, typename B<T>::type u) requires (sizeof(t) == 1);
-
-template<typename T>
-int f(T t, long);
-
-int i = f(1, 2);
-int j = f('a', 2);
-
-#endif
-} // namespace cwg2770
-
 namespace cwg2789 { // cwg2789: 18
 #if __cplusplus >= 202302L
 template <typename T = int>
diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp
index a23f7dc59517..763d983d20f6 100644
--- a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp
+++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp
@@ -154,7 +154,7 @@ void func() {
 
   bar<int>();
   // expected-note@-1 {{while checking constraint satisfaction for template 'bar<int>' required here}} \
-  // expected-note@-1 {{while substituting deduced template arguments into function template 'bar' [with T = int]}}
+  // expected-note@-1 {{in instantiation of function template specialization}}
   // expected-note@#bar {{in instantiation of static data member}}
   // expected-note@#bar {{in instantiation of requirement here}}
   // expected-note@#bar {{while checking the satisfaction of nested requirement requested here}}
diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp
index c41de77986bc..ba8e2dc372e9 100644
--- a/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp
+++ b/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp
@@ -11,7 +11,7 @@ template<typename T> struct S {
 
 // expected-error@+3{{atomic constraint must be of type 'bool' (found 'S<int>')}}
 // expected-note@#FINST{{while checking constraint satisfaction}}
-// expected-note@#FINST{{while substituting deduced template arguments into function template 'f' [with T = int]}}
+// expected-note@#FINST{{in instantiation of function template specialization}}
 template<typename T> requires (S<T>{})
 void f(T);
 void f(int);
@@ -19,7 +19,7 @@ void f(int);
 // Ensure this applies to operator && as well.
 // expected-error@+3{{atomic constraint must be of type 'bool' (found 'S<int>')}}
 // expected-note@#F2INST{{while checking constraint satisfaction}}
-// expected-note@#F2INST{{while substituting deduced template arguments into function template 'f2' [with T = int]}}
+// expected-note@#F2INST{{in instantiation of function template specialization}}
 template<typename T> requires (S<T>{} && true)
 void f2(T);
 void f2(int);
@@ -32,7 +32,7 @@ template<typename T> requires requires {
   // expected-note@-4{{while checking the satisfaction}}
   // expected-note@-6{{while substituting template arguments}}
   // expected-note@#F3INST{{while checking constraint satisfaction}}
-  // expected-note@#F3INST{{while substituting deduced template arguments into function template 'f3' [with T = int]}}
+  // expected-note@#F3INST{{in instantiation of function template specialization}}
   //
 }
 void f3(T);
diff --git a/clang/test/CodeGen/AArch64/fmv-dependencies.c b/clang/test/CodeGen/AArch64/fmv-dependencies.c
index 3a524b89496e..097b85e989d8 100644
--- a/clang/test/CodeGen/AArch64/fmv-dependencies.c
+++ b/clang/test/CodeGen/AArch64/fmv-dependencies.c
@@ -42,7 +42,7 @@ __attribute__((target_version("flagm"))) int fmv(void) { return 0; }
 // CHECK: define dso_local i32 @fmv._Mflagm2() #[[flagm2:[0-9]+]] {
 __attribute__((target_version("flagm2"))) int fmv(void) { return 0; }
 
-// CHECK: define dso_local i32 @fmv._Mfp() #[[default:[0-9]+]] {
+// CHECK: define dso_local i32 @fmv._Mfp() #[[fp:[0-9]+]] {
 __attribute__((target_version("fp"))) int fmv(void) { return 0; }
 
 // CHECK: define dso_local i32 @fmv._Mfp16() #[[fp16:[0-9]+]] {
@@ -99,7 +99,7 @@ __attribute__((target_version("sha2"))) int fmv(void) { return 0; }
 // CHECK: define dso_local i32 @fmv._Msha3() #[[sha3:[0-9]+]] {
 __attribute__((target_version("sha3"))) int fmv(void) { return 0; }
 
-// CHECK: define dso_local i32 @fmv._Msimd() #[[default]] {
+// CHECK: define dso_local i32 @fmv._Msimd() #[[simd:[0-9]+]] {
 __attribute__((target_version("simd"))) int fmv(void) { return 0; }
 
 // CHECK: define dso_local i32 @fmv._Msm4() #[[sm4:[0-9]+]] {
@@ -150,48 +150,49 @@ int caller() {
   return fmv();
 }
 
-// CHECK: attributes #[[aes]] = { {{.*}} "target-features"="+aes,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[bf16]] = { {{.*}} "target-features"="+bf16,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[bti]] = { {{.*}} "target-features"="+bti,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[crc]] = { {{.*}} "target-features"="+crc,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[dit]] = { {{.*}} "target-features"="+dit,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[dotprod]] = { {{.*}} "target-features"="+dotprod,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[dpb]] = { {{.*}} "target-features"="+ccpp,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[dpb2]] = { {{.*}} "target-features"="+ccdp,+ccpp,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[f32mm]] = { {{.*}} "target-features"="+f32mm,+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a"
-// CHECK: attributes #[[f64mm]] = { {{.*}} "target-features"="+f64mm,+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a"
-// CHECK: attributes #[[fcma]] = { {{.*}} "target-features"="+complxnum,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[flagm]] = { {{.*}} "target-features"="+flagm,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[flagm2]] = { {{.*}} "target-features"="+altnzcv,+flagm,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[default]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[fp16]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[fp16fml]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+fp16fml,+fullfp16,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[frintts]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+fptoint,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[i8mm]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+i8mm,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[jscvt]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+jsconv,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[ls64]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+ls64,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[lse]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+lse,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[memtag]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+mte,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[mops]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+mops,+neon,+outline-atomics,+v8a"
-// CHECK: attributes #[[predres]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+predres,+v8a"
-// CHECK: attributes #[[rcpc]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+rcpc,+v8a"
-// CHECK: attributes #[[rcpc2]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+rcpc,+rcpc-immo,+v8a"
-// CHECK: attributes #[[rcpc3]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+rcpc,+rcpc-immo,+rcpc3,+v8a"
-// CHECK: attributes #[[rdm]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+rdm,+v8a"
-// CHECK: attributes #[[rng]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+rand,+v8a"
-// CHECK: attributes #[[sb]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+sb,+v8a"
-// CHECK: attributes #[[sha2]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+sha2,+v8a"
-// CHECK: attributes #[[sha3]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+sha2,+sha3,+v8a"
-// CHECK: attributes #[[sm4]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+sm4,+v8a"
-// CHECK: attributes #[[sme]] = { {{.*}} "target-features"="+bf16,+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sme,+v8a"
-// CHECK: attributes #[[sme_f64f64]] = { {{.*}} "target-features"="+bf16,+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sme,+sme-f64f64,+v8a"
-// CHECK: attributes #[[sme_i16i64]] = { {{.*}} "target-features"="+bf16,+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sme,+sme-i16i64,+v8a"
-// CHECK: attributes #[[sme2]] = { {{.*}} "target-features"="+bf16,+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sme,+sme2,+v8a"
-// CHECK: attributes #[[ssbs]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+ssbs,+v8a"
-// CHECK: attributes #[[sve]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a"
-// CHECK: attributes #[[sve2]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a"
-// CHECK: attributes #[[sve2_aes]] = { {{.*}} "target-features"="+aes,+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve-aes,+sve2,+sve2-aes,+v8a"
-// CHECK: attributes #[[sve2_bitperm]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-bitperm,+v8a"
-// CHECK: attributes #[[sve2_sha3]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sha2,+sha3,+sve,+sve2,+sve2-sha3,+v8a"
-// CHECK: attributes #[[sve2_sm4]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sm4,+sve,+sve2,+sve2-sm4,+v8a"
-// CHECK: attributes #[[wfxt]] = { {{.*}} "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+v8a,+wfxt"
+// CHECK: attributes #[[aes]] = { {{.*}} "target-features"="+aes,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[bf16]] = { {{.*}} "target-features"="+bf16,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[bti]] = { {{.*}} "target-features"="+bti,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[crc]] = { {{.*}} "target-features"="+crc,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[dit]] = { {{.*}} "target-features"="+dit,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[dotprod]] = { {{.*}} "target-features"="+dotprod,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[dpb]] = { {{.*}} "target-features"="+ccpp,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[dpb2]] = { {{.*}} "target-features"="+ccdp,+ccpp,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[f32mm]] = { {{.*}} "target-features"="+f32mm,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a"
+// CHECK: attributes #[[f64mm]] = { {{.*}} "target-features"="+f64mm,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a"
+// CHECK: attributes #[[fcma]] = { {{.*}} "target-features"="+complxnum,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[flagm]] = { {{.*}} "target-features"="+flagm,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[flagm2]] = { {{.*}} "target-features"="+altnzcv,+flagm,+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[fp]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[fp16]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[fp16fml]] = { {{.*}} "target-features"="+fp-armv8,+fp16fml,+fullfp16,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[frintts]] = { {{.*}} "target-features"="+fp-armv8,+fptoint,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[i8mm]] = { {{.*}} "target-features"="+fp-armv8,+i8mm,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[jscvt]] = { {{.*}} "target-features"="+fp-armv8,+jsconv,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[ls64]] = { {{.*}} "target-features"="+fp-armv8,+ls64,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[lse]] = { {{.*}} "target-features"="+fp-armv8,+lse,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[memtag]] = { {{.*}} "target-features"="+fp-armv8,+mte,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[mops]] = { {{.*}} "target-features"="+fp-armv8,+mops,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[predres]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+predres,+v8a"
+// CHECK: attributes #[[rcpc]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rcpc,+v8a"
+// CHECK: attributes #[[rcpc2]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rcpc,+rcpc-immo,+v8a"
+// CHECK: attributes #[[rcpc3]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rcpc,+rcpc-immo,+rcpc3,+v8a"
+// CHECK: attributes #[[rdm]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rdm,+v8a"
+// CHECK: attributes #[[rng]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rand,+v8a"
+// CHECK: attributes #[[sb]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+sb,+v8a"
+// CHECK: attributes #[[sha2]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+sha2,+v8a"
+// CHECK: attributes #[[sha3]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+sha2,+sha3,+v8a"
+// CHECK: attributes #[[simd]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a"
+// CHECK: attributes #[[sm4]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+sm4,+v8a"
+// CHECK: attributes #[[sme]] = { {{.*}} "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sme,+v8a"
+// CHECK: attributes #[[sme_f64f64]] = { {{.*}} "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sme,+sme-f64f64,+v8a"
+// CHECK: attributes #[[sme_i16i64]] = { {{.*}} "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sme,+sme-i16i64,+v8a"
+// CHECK: attributes #[[sme2]] = { {{.*}} "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sme,+sme2,+v8a"
+// CHECK: attributes #[[ssbs]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+ssbs,+v8a"
+// CHECK: attributes #[[sve]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a"
+// CHECK: attributes #[[sve2]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a"
+// CHECK: attributes #[[sve2_aes]] = { {{.*}} "target-features"="+aes,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve-aes,+sve2,+sve2-aes,+v8a"
+// CHECK: attributes #[[sve2_bitperm]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-bitperm,+v8a"
+// CHECK: attributes #[[sve2_sha3]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sha2,+sha3,+sve,+sve2,+sve2-sha3,+v8a"
+// CHECK: attributes #[[sve2_sm4]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sm4,+sve,+sve2,+sve2-sm4,+v8a"
+// CHECK: attributes #[[wfxt]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,+wfxt"
diff --git a/clang/test/CodeGen/AArch64/fmv-features.c b/clang/test/CodeGen/AArch64/fmv-features.c
new file mode 100644
index 000000000000..f78bf4b5d59c
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fmv-features.c
@@ -0,0 +1,200 @@
+// Test all of the AArch64 fmv-features metadata without any dependency expansion.
+// It is used to propagate the attribute string information from C/C++ source to LLVM IR.
+
+// RUN: %clang --target=aarch64-linux-gnu --rtlib=compiler-rt -emit-llvm -S -o - %s | FileCheck %s
+
+// CHECK: define dso_local i32 @fmv._Maes() #[[aes:[0-9]+]] {
+// CHECK: define dso_local i32 @fmv._Mbf16() #[[bf16:[0-9]+]] {
+__attribute__((target_clones("aes", "bf16"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mbti() #[[bti:[0-9]+]] {
+__attribute__((target_version("bti"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mcrc() #[[crc:[0-9]+]] {
+__attribute__((target_version("crc"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mdit() #[[dit:[0-9]+]] {
+__attribute__((target_version("dit"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mdotprod() #[[dotprod:[0-9]+]] {
+__attribute__((target_version("dotprod"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mdpb() #[[dpb:[0-9]+]] {
+__attribute__((target_version("dpb"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mdpb2() #[[dpb2:[0-9]+]] {
+__attribute__((target_version("dpb2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mf32mm() #[[f32mm:[0-9]+]] {
+__attribute__((target_version("f32mm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mf64mm() #[[f64mm:[0-9]+]] {
+__attribute__((target_version("f64mm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mfcma() #[[fcma:[0-9]+]] {
+__attribute__((target_version("fcma"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mflagm() #[[flagm:[0-9]+]] {
+__attribute__((target_version("flagm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mflagm2() #[[flagm2:[0-9]+]] {
+__attribute__((target_version("flagm2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mfp() #[[fp:[0-9]+]] {
+__attribute__((target_version("fp"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mfp16() #[[fp16:[0-9]+]] {
+__attribute__((target_version("fp16"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mfp16fml() #[[fp16fml:[0-9]+]] {
+__attribute__((target_version("fp16fml"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mfrintts() #[[frintts:[0-9]+]] {
+__attribute__((target_version("frintts"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mi8mm() #[[i8mm:[0-9]+]] {
+__attribute__((target_version("i8mm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mjscvt() #[[jscvt:[0-9]+]] {
+__attribute__((target_version("jscvt"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mls64() #[[ls64:[0-9]+]] {
+__attribute__((target_version("ls64"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mlse() #[[lse:[0-9]+]] {
+__attribute__((target_version("lse"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mmemtag() #[[memtag:[0-9]+]] {
+__attribute__((target_version("memtag"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mmops() #[[mops:[0-9]+]] {
+__attribute__((target_version("mops"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mpredres() #[[predres:[0-9]+]] {
+__attribute__((target_version("predres"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mrcpc() #[[rcpc:[0-9]+]] {
+__attribute__((target_version("rcpc"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mrcpc2() #[[rcpc2:[0-9]+]] {
+__attribute__((target_version("rcpc2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mrcpc3() #[[rcpc3:[0-9]+]] {
+__attribute__((target_version("rcpc3"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mrdm() #[[rdm:[0-9]+]] {
+__attribute__((target_version("rdm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mrng() #[[rng:[0-9]+]] {
+__attribute__((target_version("rng"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msb() #[[sb:[0-9]+]] {
+__attribute__((target_version("sb"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msha2() #[[sha2:[0-9]+]] {
+__attribute__((target_version("sha2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msha3() #[[sha3:[0-9]+]] {
+__attribute__((target_version("sha3"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msimd() #[[simd:[0-9]+]] {
+__attribute__((target_version("simd"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msm4() #[[sm4:[0-9]+]] {
+__attribute__((target_version("sm4"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msme() #[[sme:[0-9]+]] {
+__attribute__((target_version("sme"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msme-f64f64() #[[sme_f64f64:[0-9]+]] {
+__attribute__((target_version("sme-f64f64"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msme-i16i64() #[[sme_i16i64:[0-9]+]] {
+__attribute__((target_version("sme-i16i64"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msme2() #[[sme2:[0-9]+]] {
+__attribute__((target_version("sme2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mssbs() #[[ssbs:[0-9]+]] {
+__attribute__((target_version("ssbs"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve() #[[sve:[0-9]+]] {
+__attribute__((target_version("sve"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve2() #[[sve2:[0-9]+]] {
+__attribute__((target_version("sve2"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve2-aes() #[[sve2_aes:[0-9]+]] {
+__attribute__((target_version("sve2-aes"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve2-bitperm() #[[sve2_bitperm:[0-9]+]] {
+__attribute__((target_version("sve2-bitperm"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve2-sha3() #[[sve2_sha3:[0-9]+]] {
+__attribute__((target_version("sve2-sha3"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Msve2-sm4() #[[sve2_sm4:[0-9]+]] {
+__attribute__((target_version("sve2-sm4"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._Mwfxt() #[[wfxt:[0-9]+]] {
+__attribute__((target_version("wfxt"))) int fmv(void) { return 0; }
+
+// CHECK: define dso_local i32 @fmv._MaesMbf16MbtiMcrc() #[[multiple_features:[0-9]+]] {
+__attribute__((target_version("aes+bf16+bti+crc"))) int fmv(void) { return 0; }
+
+// CHECK-NOT: define dso_local i32 @fmv._M{{.*}}
+__attribute__((target_version("non_existent_extension"))) int fmv(void);
+
+__attribute__((target_version("default"))) int fmv(void);
+
+int caller() {
+  return fmv();
+}
+
+// CHECK: attributes #[[aes]] = { {{.*}} "fmv-features"="+aes"
+// CHECK: attributes #[[bf16]] = { {{.*}} "fmv-features"="+bf16"
+// CHECK: attributes #[[bti]] = { {{.*}} "fmv-features"="+bti"
+// CHECK: attributes #[[crc]] = { {{.*}} "fmv-features"="+crc"
+// CHECK: attributes #[[dit]] = { {{.*}} "fmv-features"="+dit"
+// CHECK: attributes #[[dotprod]] = { {{.*}} "fmv-features"="+dotprod"
+// CHECK: attributes #[[dpb]] = { {{.*}} "fmv-features"="+dpb"
+// CHECK: attributes #[[dpb2]] = { {{.*}} "fmv-features"="+dpb2"
+// CHECK: attributes #[[f32mm]] = { {{.*}} "fmv-features"="+f32mm"
+// CHECK: attributes #[[f64mm]] = { {{.*}} "fmv-features"="+f64mm"
+// CHECK: attributes #[[fcma]] = { {{.*}} "fmv-features"="+fcma"
+// CHECK: attributes #[[flagm]] = { {{.*}} "fmv-features"="+flagm"
+// CHECK: attributes #[[flagm2]] = { {{.*}} "fmv-features"="+flagm2"
+// CHECK: attributes #[[fp]] = { {{.*}} "fmv-features"="+fp"
+// CHECK: attributes #[[fp16]] = { {{.*}} "fmv-features"="+fp16"
+// CHECK: attributes #[[fp16fml]] = { {{.*}} "fmv-features"="+fp16fml"
+// CHECK: attributes #[[frintts]] = { {{.*}} "fmv-features"="+frintts"
+// CHECK: attributes #[[i8mm]] = { {{.*}} "fmv-features"="+i8mm"
+// CHECK: attributes #[[jscvt]] = { {{.*}} "fmv-features"="+jscvt"
+// CHECK: attributes #[[ls64]] = { {{.*}} "fmv-features"="+ls64"
+// CHECK: attributes #[[lse]] = { {{.*}} "fmv-features"="+lse"
+// CHECK: attributes #[[memtag]] = { {{.*}} "fmv-features"="+memtag"
+// CHECK: attributes #[[mops]] = { {{.*}} "fmv-features"="+mops"
+// CHECK: attributes #[[predres]] = { {{.*}} "fmv-features"="+predres"
+// CHECK: attributes #[[rcpc]] = { {{.*}} "fmv-features"="+rcpc"
+// CHECK: attributes #[[rcpc2]] = { {{.*}} "fmv-features"="+rcpc2"
+// CHECK: attributes #[[rcpc3]] = { {{.*}} "fmv-features"="+rcpc3"
+// CHECK: attributes #[[rdm]] = { {{.*}} "fmv-features"="+rdm"
+// CHECK: attributes #[[rng]] = { {{.*}} "fmv-features"="+rng"
+// CHECK: attributes #[[sb]] = { {{.*}} "fmv-features"="+sb"
+// CHECK: attributes #[[sha2]] = { {{.*}} "fmv-features"="+sha2"
+// CHECK: attributes #[[sha3]] = { {{.*}} "fmv-features"="+sha3"
+// CHECK: attributes #[[simd]] = { {{.*}} "fmv-features"="+simd"
+// CHECK: attributes #[[sm4]] = { {{.*}} "fmv-features"="+sm4"
+// CHECK: attributes #[[sme]] = { {{.*}} "fmv-features"="+sme"
+// CHECK: attributes #[[sme_f64f64]] = { {{.*}} "fmv-features"="+sme-f64f64"
+// CHECK: attributes #[[sme_i16i64]] = { {{.*}} "fmv-features"="+sme-i16i64"
+// CHECK: attributes #[[sme2]] = { {{.*}} "fmv-features"="+sme2"
+// CHECK: attributes #[[ssbs]] = { {{.*}} "fmv-features"="+ssbs"
+// CHECK: attributes #[[sve]] = { {{.*}} "fmv-features"="+sve"
+// CHECK: attributes #[[sve2]] = { {{.*}} "fmv-features"="+sve2"
+// CHECK: attributes #[[sve2_aes]] = { {{.*}} "fmv-features"="+sve2-aes"
+// CHECK: attributes #[[sve2_bitperm]] = { {{.*}} "fmv-features"="+sve2-bitperm"
+// CHECK: attributes #[[sve2_sha3]] = { {{.*}} "fmv-features"="+sve2-sha3"
+// CHECK: attributes #[[sve2_sm4]] = { {{.*}} "fmv-features"="+sve2-sm4"
+// CHECK: attributes #[[wfxt]] = { {{.*}} "fmv-features"="+wfxt"
+// CHECK: attributes #[[multiple_features]] = { {{.*}} "fmv-features"="+aes,+bf16,+bti,+crc"
diff --git a/clang/test/CodeGen/AArch64/fmv-priority.c b/clang/test/CodeGen/AArch64/fmv-priority.c
new file mode 100644
index 000000000000..080bb54736a7
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fmv-priority.c
@@ -0,0 +1,55 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 5
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Priority biskmasks after feature dependency expansion:
+//
+// MSB                                                    LSB
+//
+// sme2 | ls64 | sme | bf16 |       |      | fp16 | simd | fp
+// -----+------+-----+------+-------+------+------+------+---
+// sme2 |      | sme | bf16 | rcpc2 | rcpc | fp16 | simd | fp
+//
+// Dependencies should not affect priorities, since a
+// feature can only depend on lower priority features:
+// https://github.com/ARM-software/acle/pull/376
+
+__attribute__((target_version("sme2+ls64"))) int fn(void);
+__attribute__((target_version("sme2+rcpc2"))) int fn(void);
+__attribute__((target_version("default"))) int fn(void) { return 0; }
+
+int call() { return fn(); }
+
+// CHECK-LABEL: define dso_local i32 @fn.default(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret i32 0
+//
+//
+// CHECK-LABEL: define dso_local i32 @call(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @fn()
+// CHECK-NEXT:    ret i32 [[CALL]]
+//
+//
+// CHECK-LABEL: define weak_odr ptr @fn.resolver() comdat {
+// CHECK-NEXT:  [[RESOLVER_ENTRY:.*:]]
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 153126785511392000
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 153126785511392000
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
+// CHECK:       [[RESOLVER_RETURN]]:
+// CHECK-NEXT:    ret ptr @fn._Mls64Msme2
+// CHECK:       [[RESOLVER_ELSE]]:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 144119586269233920
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 144119586269233920
+// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-NEXT:    br i1 [[TMP7]], label %[[RESOLVER_RETURN1:.*]], label %[[RESOLVER_ELSE2:.*]]
+// CHECK:       [[RESOLVER_RETURN1]]:
+// CHECK-NEXT:    ret ptr @fn._Mrcpc2Msme2
+// CHECK:       [[RESOLVER_ELSE2]]:
+// CHECK-NEXT:    ret ptr @fn.default
+//
diff --git a/clang/test/CodeGen/AArch64/fmv-streaming.c b/clang/test/CodeGen/AArch64/fmv-streaming.c
index dc0c35a9a307..68ba3e5cfaa7 100644
--- a/clang/test/CodeGen/AArch64/fmv-streaming.c
+++ b/clang/test/CodeGen/AArch64/fmv-streaming.c
@@ -53,10 +53,10 @@ __attribute__((target_version("default"))) void sc_callee(void) __arm_streaming_
 
 
 // CHECK-LABEL: define {{[^@]+}}@n_caller
-// CHECK-SAME: () #[[caller:[0-9]+]] {
+// CHECK-SAME: () #[[default]] {
 // CHECK:    call void @n_callee()
-// CHECK:    call void @s_callee() #[[callsite_streaming:[0-9]+]]
-// CHECK:    call void @sc_callee() #[[callsite_streaming_compatible:[0-9]+]]
+// CHECK:    call void @s_callee() #[[streaming:[0-9]+]]
+// CHECK:    call void @sc_callee() #[[streaming_compatible:[0-9]+]]
 //
 void n_caller(void) {
   n_callee();
@@ -66,10 +66,10 @@ void n_caller(void) {
 
 
 // CHECK-LABEL: define {{[^@]+}}@s_caller
-// CHECK-SAME: () #[[caller_streaming:[0-9]+]] {
+// CHECK-SAME: () #[[default_streaming]] {
 // CHECK:    call void @n_callee()
-// CHECK:    call void @s_callee() #[[callsite_streaming]]
-// CHECK:    call void @sc_callee() #[[callsite_streaming_compatible]]
+// CHECK:    call void @s_callee() #[[streaming]]
+// CHECK:    call void @sc_callee() #[[streaming_compatible]]
 //
 void s_caller(void) __arm_streaming {
   n_callee();
@@ -79,10 +79,10 @@ void s_caller(void) __arm_streaming {
 
 
 // CHECK-LABEL: define {{[^@]+}}@sc_caller
-// CHECK-SAME: () #[[caller_streaming_compatible:[0-9]+]] {
+// CHECK-SAME: () #[[default_streaming_compatible]] {
 // CHECK:    call void @n_callee()
-// CHECK:    call void @s_callee() #[[callsite_streaming]]
-// CHECK:    call void @sc_callee() #[[callsite_streaming_compatible]]
+// CHECK:    call void @s_callee() #[[streaming]]
+// CHECK:    call void @sc_callee() #[[streaming_compatible]]
 //
 void sc_caller(void) __arm_streaming_compatible {
   n_callee();
@@ -103,8 +103,5 @@ void sc_caller(void) __arm_streaming_compatible {
 // CHECK: attributes #[[simd_streaming_compatible]] = {{.*}} "aarch64_pstate_sm_compatible"
 // CHECK: attributes #[[locally_streaming_sme2_streaming_compatible]] = {{.*}} "aarch64_pstate_sm_body" "aarch64_pstate_sm_compatible"
 // CHECK: attributes #[[default_streaming_compatible]] = {{.*}} "aarch64_pstate_sm_compatible"
-// CHECK: attributes #[[caller]] = {{.*}}
-// CHECK: attributes #[[caller_streaming]] = {{.*}} "aarch64_pstate_sm_enabled"
-// CHECK: attributes #[[caller_streaming_compatible]] = {{.*}} "aarch64_pstate_sm_compatible"
-// CHECK: attributes #[[callsite_streaming]] = {{.*}} "aarch64_pstate_sm_enabled"
-// CHECK: attributes #[[callsite_streaming_compatible]] = {{.*}} "aarch64_pstate_sm_compatible"
+// CHECK: attributes #[[streaming]] = {{.*}} "aarch64_pstate_sm_enabled"
+// CHECK: attributes #[[streaming_compatible]] = {{.*}} "aarch64_pstate_sm_compatible"
diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_state_funs.c b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_state_funs.c
index 9ba1527f2696..72f2d17fc6dc 100644
--- a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_state_funs.c
+++ b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_state_funs.c
@@ -6,34 +6,53 @@
 
 #include <arm_sme.h>
 
-// CHECK-LABEL: @test_in_streaming_mode(
+// CHECK-LABEL: @test_in_streaming_mode_streaming_compatible(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3:[0-9]+]]
-// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
-// CHECK-NEXT:    [[AND_I:%.*]] = and i64 [[TMP1]], 1
-// CHECK-NEXT:    [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0
-// CHECK-NEXT:    ret i1 [[TOBOOL_I]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.aarch64.sme.in.streaming.mode()
+// CHECK-NEXT:    ret i1 [[TMP0]]
 //
-// CPP-CHECK-LABEL: @_Z22test_in_streaming_modev(
+// CPP-CHECK-LABEL: @_Z43test_in_streaming_mode_streaming_compatiblev(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3:[0-9]+]]
-// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
-// CPP-CHECK-NEXT:    [[AND_I:%.*]] = and i64 [[TMP1]], 1
-// CPP-CHECK-NEXT:    [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0
-// CPP-CHECK-NEXT:    ret i1 [[TOBOOL_I]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i1 @llvm.aarch64.sme.in.streaming.mode()
+// CPP-CHECK-NEXT:    ret i1 [[TMP0]]
+//
+bool test_in_streaming_mode_streaming_compatible(void) __arm_streaming_compatible {
+  return __arm_in_streaming_mode();
+}
+
+// CHECK-LABEL: @test_in_streaming_mode_streaming(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i1 true
+//
+// CPP-CHECK-LABEL: @_Z32test_in_streaming_mode_streamingv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    ret i1 true
+//
+bool test_in_streaming_mode_streaming(void) __arm_streaming {
+//
+  return __arm_in_streaming_mode();
+}
+
+// CHECK-LABEL: @test_in_streaming_mode_non_streaming(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i1 false
+//
+// CPP-CHECK-LABEL: @_Z36test_in_streaming_mode_non_streamingv(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    ret i1 false
 //
-bool test_in_streaming_mode(void) __arm_streaming_compatible {
+bool test_in_streaming_mode_non_streaming(void) {
   return __arm_in_streaming_mode();
 }
 
 // CHECK-LABEL: @test_za_disable(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @__arm_za_disable() #[[ATTR3]]
+// CHECK-NEXT:    tail call void @__arm_za_disable() #[[ATTR7:[0-9]+]]
 // CHECK-NEXT:    ret void
 //
 // CPP-CHECK-LABEL: @_Z15test_za_disablev(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    tail call void @__arm_za_disable() #[[ATTR3]]
+// CPP-CHECK-NEXT:    tail call void @__arm_za_disable() #[[ATTR7:[0-9]+]]
 // CPP-CHECK-NEXT:    ret void
 //
 void test_za_disable(void) __arm_streaming_compatible {
@@ -42,14 +61,14 @@ void test_za_disable(void) __arm_streaming_compatible {
 
 // CHECK-LABEL: @test_has_sme(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR7]]
 // CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
 // CHECK-NEXT:    [[TOBOOL_I:%.*]] = icmp slt i64 [[TMP1]], 0
 // CHECK-NEXT:    ret i1 [[TOBOOL_I]]
 //
 // CPP-CHECK-LABEL: @_Z12test_has_smev(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3]]
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR7]]
 // CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
 // CPP-CHECK-NEXT:    [[TOBOOL_I:%.*]] = icmp slt i64 [[TMP1]], 0
 // CPP-CHECK-NEXT:    ret i1 [[TOBOOL_I]]
@@ -72,12 +91,12 @@ void test_svundef_za(void) __arm_streaming_compatible __arm_out("za") {
 
 // CHECK-LABEL: @test_sc_memcpy(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
 // CHECK-NEXT:    ret ptr [[CALL]]
 //
 // CPP-CHECK-LABEL: @_Z14test_sc_memcpyPvPKvm(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
+// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
 // CPP-CHECK-NEXT:    ret ptr [[CALL]]
 //
 void *test_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_compatible {
@@ -86,12 +105,12 @@ void *test_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_comp
 
 // CHECK-LABEL: @test_sc_memmove(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
 // CHECK-NEXT:    ret ptr [[CALL]]
 //
 // CPP-CHECK-LABEL: @_Z15test_sc_memmovePvPKvm(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
+// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
 // CPP-CHECK-NEXT:    ret ptr [[CALL]]
 //
 void *test_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_compatible {
@@ -100,12 +119,12 @@ void *test_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_com
 
 // CHECK-LABEL: @test_sc_memset(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
 // CHECK-NEXT:    ret ptr [[CALL]]
 //
 // CPP-CHECK-LABEL: @_Z14test_sc_memsetPvim(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
+// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
 // CPP-CHECK-NEXT:    ret ptr [[CALL]]
 //
 void *test_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible {
@@ -114,12 +133,12 @@ void *test_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible {
 
 // CHECK-LABEL: @test_sc_memchr(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
 // CHECK-NEXT:    ret ptr [[CALL]]
 //
 // CPP-CHECK-LABEL: @_Z14test_sc_memchrPvim(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]]
+// CPP-CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR7]]
 // CPP-CHECK-NEXT:    ret ptr [[CALL]]
 //
 void *test_sc_memchr(void *s, int c, size_t n) __arm_streaming_compatible {
diff --git a/clang/test/CodeGen/attr-target-clones-aarch64.c b/clang/test/CodeGen/attr-target-clones-aarch64.c
index 6b7acbbd4fc5..b7e3a328db87 100644
--- a/clang/test/CodeGen/attr-target-clones-aarch64.c
+++ b/clang/test/CodeGen/attr-target-clones-aarch64.c
@@ -64,20 +64,20 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-NEXT:  resolver_entry:
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 33664
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 33664
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 69793284352
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 69793284352
 // CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
 // CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
 // CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @ftc._MaesMlse
+// CHECK-NEXT:    ret ptr @ftc._Msve2
 // CHECK:       resolver_else:
 // CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 69793284352
-// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 69793284352
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 33664
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 33664
 // CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
 // CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
 // CHECK:       resolver_return1:
-// CHECK-NEXT:    ret ptr @ftc._Msve2
+// CHECK-NEXT:    ret ptr @ftc._MaesMlse
 // CHECK:       resolver_else2:
 // CHECK-NEXT:    ret ptr @ftc.default
 //
@@ -252,56 +252,56 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc.default
-// CHECK-SAME: () #[[ATTR9:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR8]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_def.default
-// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-SAME: () #[[ATTR8]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 1
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_dup1.default
-// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-SAME: () #[[ATTR8]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 2
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_dup2.default
-// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-SAME: () #[[ATTR8]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 3
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_dup3.default
-// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-SAME: () #[[ATTR8]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 4
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_inline2._Mfp16
-// CHECK-SAME: () #[[ATTR10:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR9:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 2
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_inline2._MfcmaMsve2-bitperm
-// CHECK-SAME: () #[[ATTR11:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR10:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 2
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_inline2.default
-// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-SAME: () #[[ATTR8]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 2
 //
@@ -330,28 +330,28 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_inline1._MrngMsimd
-// CHECK-SAME: () #[[ATTR12:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR11:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 1
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_inline1._MpredresMrcpc
-// CHECK-SAME: () #[[ATTR13:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR12:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 1
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_inline1._Msve2-aesMwfxt
-// CHECK-SAME: () #[[ATTR14:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR13:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 1
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_inline1.default
-// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-SAME: () #[[ATTR8]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 1
 //
@@ -395,14 +395,14 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_inline3._MsbMsve
-// CHECK-SAME: () #[[ATTR15:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR14:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 3
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@ftc_inline3.default
-// CHECK-SAME: () #[[ATTR9]] {
+// CHECK-SAME: () #[[ATTR8]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 3
 //
@@ -411,20 +411,20 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-NEXT:  resolver_entry:
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 70369817985280
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70369817985280
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 1125899906842624
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1125899906842624
 // CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
 // CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
 // CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @ftc_inline3._MsbMsve
+// CHECK-NEXT:    ret ptr @ftc_inline3._Mbti
 // CHECK:       resolver_else:
 // CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 1125899906842624
-// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 1125899906842624
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 70369817985280
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 70369817985280
 // CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
 // CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
 // CHECK:       resolver_return1:
-// CHECK-NEXT:    ret ptr @ftc_inline3._Mbti
+// CHECK-NEXT:    ret ptr @ftc_inline3._MsbMsve
 // CHECK:       resolver_else2:
 // CHECK-NEXT:    ret ptr @ftc_inline3.default
 //
@@ -521,20 +521,20 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-MTE-BTI-NEXT:  resolver_entry:
 // CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 33664
-// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 33664
+// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 69793284352
+// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 69793284352
 // CHECK-MTE-BTI-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
 // CHECK-MTE-BTI-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
 // CHECK-MTE-BTI:       resolver_return:
-// CHECK-MTE-BTI-NEXT:    ret ptr @ftc._MaesMlse
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc._Msve2
 // CHECK-MTE-BTI:       resolver_else:
 // CHECK-MTE-BTI-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 69793284352
-// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 69793284352
+// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 33664
+// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 33664
 // CHECK-MTE-BTI-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
 // CHECK-MTE-BTI-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
 // CHECK-MTE-BTI:       resolver_return1:
-// CHECK-MTE-BTI-NEXT:    ret ptr @ftc._Msve2
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc._MaesMlse
 // CHECK-MTE-BTI:       resolver_else2:
 // CHECK-MTE-BTI-NEXT:    ret ptr @ftc.default
 //
@@ -548,7 +548,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_def._MmemtagMsha2
-// CHECK-MTE-BTI-SAME: () #[[ATTR2]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR3:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 1
 //
@@ -598,14 +598,14 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup2._Mfp
-// CHECK-MTE-BTI-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR4:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 3
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup2._McrcMdotprod
-// CHECK-MTE-BTI-SAME: () #[[ATTR4:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR5:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 3
 //
@@ -634,14 +634,14 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup3._Mmemtag
-// CHECK-MTE-BTI-SAME: () #[[ATTR5:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR6:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 4
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup3._Mbti
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR7:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 4
 //
@@ -670,7 +670,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@foo
-// CHECK-MTE-BTI-SAME: () #[[ATTR6:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    [[CALL:%.*]] = call i32 @ftc()
 // CHECK-MTE-BTI-NEXT:    [[CALL1:%.*]] = call i32 @ftc_def()
@@ -686,14 +686,14 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_direct
-// CHECK-MTE-BTI-SAME: () #[[ATTR6]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 4
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@main
-// CHECK-MTE-BTI-SAME: () #[[ATTR6]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
 // CHECK-MTE-BTI-NEXT:    store i32 0, ptr [[RETVAL]], align 4
@@ -709,56 +709,56 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc.default
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 0
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_def.default
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 1
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup1.default
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 2
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup2.default
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 3
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_dup3.default
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 4
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline2._Mfp16
-// CHECK-MTE-BTI-SAME: () #[[ATTR7:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR9:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 2
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline2._MfcmaMsve2-bitperm
-// CHECK-MTE-BTI-SAME: () #[[ATTR8:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR10:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 2
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline2.default
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 2
 //
@@ -787,28 +787,28 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline1._MrngMsimd
-// CHECK-MTE-BTI-SAME: () #[[ATTR9:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR11:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 1
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline1._MpredresMrcpc
-// CHECK-MTE-BTI-SAME: () #[[ATTR10:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR12:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 1
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline1._Msve2-aesMwfxt
-// CHECK-MTE-BTI-SAME: () #[[ATTR11:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR13:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 1
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline1.default
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 1
 //
@@ -845,21 +845,21 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline3._Mbti
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR7]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 3
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline3._MsbMsve
-// CHECK-MTE-BTI-SAME: () #[[ATTR12:[0-9]+]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR14:[0-9]+]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 3
 //
 //
 // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone
 // CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline3.default
-// CHECK-MTE-BTI-SAME: () #[[ATTR5]] {
+// CHECK-MTE-BTI-SAME: () #[[ATTR8]] {
 // CHECK-MTE-BTI-NEXT:  entry:
 // CHECK-MTE-BTI-NEXT:    ret i32 3
 //
@@ -868,20 +868,20 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK-MTE-BTI-NEXT:  resolver_entry:
 // CHECK-MTE-BTI-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-MTE-BTI-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 70369817985280
-// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70369817985280
+// CHECK-MTE-BTI-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 1125899906842624
+// CHECK-MTE-BTI-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1125899906842624
 // CHECK-MTE-BTI-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
 // CHECK-MTE-BTI-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
 // CHECK-MTE-BTI:       resolver_return:
-// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3._MsbMsve
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3._Mbti
 // CHECK-MTE-BTI:       resolver_else:
 // CHECK-MTE-BTI-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 1125899906842624
-// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 1125899906842624
+// CHECK-MTE-BTI-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 70369817985280
+// CHECK-MTE-BTI-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 70369817985280
 // CHECK-MTE-BTI-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
 // CHECK-MTE-BTI-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
 // CHECK-MTE-BTI:       resolver_return1:
-// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3._Mbti
+// CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3._MsbMsve
 // CHECK-MTE-BTI:       resolver_else2:
 // CHECK-MTE-BTI-NEXT:    ret ptr @ftc_inline3.default
 //
diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c
index 951401c498de..336d8b0a4dff 100644
--- a/clang/test/CodeGen/attr-target-version.c
+++ b/clang/test/CodeGen/attr-target-version.c
@@ -141,6 +141,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 // CHECK: @fmv_two = weak_odr ifunc i32 (), ptr @fmv_two.resolver
 // CHECK: @fmv_e = weak_odr ifunc i32 (), ptr @fmv_e.resolver
 // CHECK: @fmv_d = internal ifunc i32 (), ptr @fmv_d.resolver
+// CHECK: @fmv_default = weak_odr ifunc i32 (), ptr @fmv_default.resolver
 // CHECK: @fmv_c = weak_odr ifunc void (), ptr @fmv_c.resolver
 // CHECK: @fmv_inline = weak_odr ifunc i32 (), ptr @fmv_inline.resolver
 // CHECK: @reca = weak_odr ifunc void (), ptr @reca.resolver
@@ -271,7 +272,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@foo
-// CHECK-SAME: () #[[ATTR15:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CALL:%.*]] = call i32 @fmv()
 // CHECK-NEXT:    [[CALL1:%.*]] = call i32 @fmv_one()
@@ -297,7 +298,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@fmv_c._Mssbs
-// CHECK-SAME: () #[[ATTR17:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR15:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret void
 //
@@ -311,7 +312,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@goo
-// CHECK-SAME: () #[[ATTR15]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CALL:%.*]] = call i32 @fmv_inline()
 // CHECK-NEXT:    [[CALL1:%.*]] = call i32 @fmv_e()
@@ -323,7 +324,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@recur
-// CHECK-SAME: () #[[ATTR15]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    call void @reca()
 // CHECK-NEXT:    ret void
@@ -331,7 +332,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@hoo
-// CHECK-SAME: () #[[ATTR15]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[FP1:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    [[FP2:%.*]] = alloca ptr, align 8
@@ -348,28 +349,28 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@unused_with_forward_default_decl._Mmops
-// CHECK-SAME: () #[[ATTR19:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR17:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_extern_forward_default_decl._Mdotprod
-// CHECK-SAME: () #[[ATTR20:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR18:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@unused_with_default_decl._Maes
-// CHECK-SAME: () #[[ATTR5]] {
+// CHECK-SAME: () #[[ATTR19:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@unused_with_default_def._Msve
-// CHECK-SAME: () #[[ATTR21:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR20:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
@@ -383,7 +384,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_default_def._Mfp16
-// CHECK-SAME: () #[[ATTR22:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR21:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
@@ -397,49 +398,49 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def.default
-// CHECK-SAME: () #[[ATTR15]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def._Mlse
-// CHECK-SAME: () #[[ATTR23:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR22:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 1
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@unused_without_default._Mrdm
-// CHECK-SAME: () #[[ATTR24:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR23:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@default_def_with_version_decls.default
-// CHECK-SAME: () #[[ATTR15]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl._Mjscvt
-// CHECK-SAME: () #[[ATTR26:[0-9]+]] {
+// CHECK-SAME: () #[[ATTR25:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 1
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl._Mrdm
-// CHECK-SAME: () #[[ATTR24]] {
+// CHECK-SAME: () #[[ATTR26:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 2
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@caller
-// CHECK-SAME: () #[[ATTR15]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CALL:%.*]] = call i32 @used_def_without_default_decl()
 // CHECK-NEXT:    [[CALL1:%.*]] = call i32 @used_decl_without_default_decl()
@@ -462,12 +463,12 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 // CHECK-NEXT:  resolver_entry:
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 66315
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 66315
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 144119586256651008
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 144119586256651008
 // CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
 // CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
 // CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @fmv._MflagmMfp16fmlMrng
+// CHECK-NEXT:    ret ptr @fmv._Msme2
 // CHECK:       resolver_else:
 // CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
 // CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 72061992218723078
@@ -478,60 +479,60 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 // CHECK-NEXT:    ret ptr @fmv._Mflagm2Msme-i16i64
 // CHECK:       resolver_else2:
 // CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = and i64 [[TMP8]], 9007199254741776
-// CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 9007199254741776
+// CHECK-NEXT:    [[TMP9:%.*]] = and i64 [[TMP8]], 9007199254742016
+// CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 9007199254742016
 // CHECK-NEXT:    [[TMP11:%.*]] = and i1 true, [[TMP10]]
 // CHECK-NEXT:    br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]]
 // CHECK:       resolver_return3:
-// CHECK-NEXT:    ret ptr @fmv._MdotprodMls64
+// CHECK-NEXT:    ret ptr @fmv._McrcMls64
 // CHECK:       resolver_else4:
 // CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = and i64 [[TMP12]], 9007199254742016
-// CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 9007199254742016
+// CHECK-NEXT:    [[TMP13:%.*]] = and i64 [[TMP12]], 9007199254741776
+// CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 9007199254741776
 // CHECK-NEXT:    [[TMP15:%.*]] = and i1 true, [[TMP14]]
 // CHECK-NEXT:    br i1 [[TMP15]], label [[RESOLVER_RETURN5:%.*]], label [[RESOLVER_ELSE6:%.*]]
 // CHECK:       resolver_return5:
-// CHECK-NEXT:    ret ptr @fmv._McrcMls64
+// CHECK-NEXT:    ret ptr @fmv._MdotprodMls64
 // CHECK:       resolver_else6:
 // CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP17:%.*]] = and i64 [[TMP16]], 17592186110728
-// CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 17592186110728
+// CHECK-NEXT:    [[TMP17:%.*]] = and i64 [[TMP16]], 1125899906842624
+// CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 1125899906842624
 // CHECK-NEXT:    [[TMP19:%.*]] = and i1 true, [[TMP18]]
 // CHECK-NEXT:    br i1 [[TMP19]], label [[RESOLVER_RETURN7:%.*]], label [[RESOLVER_ELSE8:%.*]]
 // CHECK:       resolver_return7:
-// CHECK-NEXT:    ret ptr @fmv._Mfp16fmlMmemtag
+// CHECK-NEXT:    ret ptr @fmv._Mbti
 // CHECK:       resolver_else8:
 // CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = and i64 [[TMP20]], 33536
-// CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 33536
+// CHECK-NEXT:    [[TMP21:%.*]] = and i64 [[TMP20]], 17592186110728
+// CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 17592186110728
 // CHECK-NEXT:    [[TMP23:%.*]] = and i1 true, [[TMP22]]
 // CHECK-NEXT:    br i1 [[TMP23]], label [[RESOLVER_RETURN9:%.*]], label [[RESOLVER_ELSE10:%.*]]
 // CHECK:       resolver_return9:
-// CHECK-NEXT:    ret ptr @fmv._MaesMfp
+// CHECK-NEXT:    ret ptr @fmv._Mfp16fmlMmemtag
 // CHECK:       resolver_else10:
 // CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = and i64 [[TMP24]], 4992
-// CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 4992
+// CHECK-NEXT:    [[TMP25:%.*]] = and i64 [[TMP24]], 66315
+// CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 66315
 // CHECK-NEXT:    [[TMP27:%.*]] = and i1 true, [[TMP26]]
 // CHECK-NEXT:    br i1 [[TMP27]], label [[RESOLVER_RETURN11:%.*]], label [[RESOLVER_ELSE12:%.*]]
 // CHECK:       resolver_return11:
-// CHECK-NEXT:    ret ptr @fmv._MlseMsha2
+// CHECK-NEXT:    ret ptr @fmv._MflagmMfp16fmlMrng
 // CHECK:       resolver_else12:
 // CHECK-NEXT:    [[TMP28:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP29:%.*]] = and i64 [[TMP28]], 144119586256651008
-// CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[TMP29]], 144119586256651008
+// CHECK-NEXT:    [[TMP29:%.*]] = and i64 [[TMP28]], 33536
+// CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[TMP29]], 33536
 // CHECK-NEXT:    [[TMP31:%.*]] = and i1 true, [[TMP30]]
 // CHECK-NEXT:    br i1 [[TMP31]], label [[RESOLVER_RETURN13:%.*]], label [[RESOLVER_ELSE14:%.*]]
 // CHECK:       resolver_return13:
-// CHECK-NEXT:    ret ptr @fmv._Msme2
+// CHECK-NEXT:    ret ptr @fmv._MaesMfp
 // CHECK:       resolver_else14:
 // CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP33:%.*]] = and i64 [[TMP32]], 1125899906842624
-// CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[TMP33]], 1125899906842624
+// CHECK-NEXT:    [[TMP33:%.*]] = and i64 [[TMP32]], 4992
+// CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[TMP33]], 4992
 // CHECK-NEXT:    [[TMP35:%.*]] = and i1 true, [[TMP34]]
 // CHECK-NEXT:    br i1 [[TMP35]], label [[RESOLVER_RETURN15:%.*]], label [[RESOLVER_ELSE16:%.*]]
 // CHECK:       resolver_return15:
-// CHECK-NEXT:    ret ptr @fmv._Mbti
+// CHECK-NEXT:    ret ptr @fmv._MlseMsha2
 // CHECK:       resolver_else16:
 // CHECK-NEXT:    ret ptr @fmv.default
 //
@@ -630,6 +631,11 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 // CHECK-NEXT:    ret ptr @fmv_d.default
 //
 //
+// CHECK-LABEL: define {{[^@]+}}@fmv_default.resolver() comdat {
+// CHECK-NEXT:  resolver_entry:
+// CHECK-NEXT:    ret ptr @fmv_default.default
+//
+//
 // CHECK-LABEL: define {{[^@]+}}@fmv_c.resolver() comdat {
 // CHECK-NEXT:  resolver_entry:
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
@@ -767,60 +773,60 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 // CHECK-NEXT:  resolver_entry:
 // CHECK-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 4398182892352
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 4398182892352
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 864708720653762560
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 864708720653762560
 // CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
 // CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
 // CHECK:       resolver_return:
-// CHECK-NEXT:    ret ptr @fmv_inline._MfcmaMfp16MrdmMsme
+// CHECK-NEXT:    ret ptr @fmv_inline._MmemtagMmopsMrcpc3
 // CHECK:       resolver_else:
 // CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 864708720653762560
-// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 864708720653762560
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 19861002584864
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 19861002584864
 // CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
 // CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
 // CHECK:       resolver_return1:
-// CHECK-NEXT:    ret ptr @fmv_inline._MmemtagMmopsMrcpc3
+// CHECK-NEXT:    ret ptr @fmv_inline._MmemtagMsve2-sm4
 // CHECK:       resolver_else2:
 // CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP9:%.*]] = and i64 [[TMP8]], 894427038464
-// CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 894427038464
+// CHECK-NEXT:    [[TMP9:%.*]] = and i64 [[TMP8]], 4398182892352
+// CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 4398182892352
 // CHECK-NEXT:    [[TMP11:%.*]] = and i1 true, [[TMP10]]
 // CHECK-NEXT:    br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]]
 // CHECK:       resolver_return3:
-// CHECK-NEXT:    ret ptr @fmv_inline._Msve2Msve2-aesMsve2-bitperm
+// CHECK-NEXT:    ret ptr @fmv_inline._MfcmaMfp16MrdmMsme
 // CHECK:       resolver_else4:
 // CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP13:%.*]] = and i64 [[TMP12]], 35433583360
-// CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 35433583360
+// CHECK-NEXT:    [[TMP13:%.*]] = and i64 [[TMP12]], 1444182864640
+// CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 1444182864640
 // CHECK-NEXT:    [[TMP15:%.*]] = and i1 true, [[TMP14]]
 // CHECK-NEXT:    br i1 [[TMP15]], label [[RESOLVER_RETURN5:%.*]], label [[RESOLVER_ELSE6:%.*]]
 // CHECK:       resolver_return5:
-// CHECK-NEXT:    ret ptr @fmv_inline._MaesMf64mmMsha2
+// CHECK-NEXT:    ret ptr @fmv_inline._Msve2-aesMsve2-sha3
 // CHECK:       resolver_else6:
 // CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP17:%.*]] = and i64 [[TMP16]], 18320798464
-// CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 18320798464
+// CHECK-NEXT:    [[TMP17:%.*]] = and i64 [[TMP16]], 894427038464
+// CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 894427038464
 // CHECK-NEXT:    [[TMP19:%.*]] = and i1 true, [[TMP18]]
 // CHECK-NEXT:    br i1 [[TMP19]], label [[RESOLVER_RETURN7:%.*]], label [[RESOLVER_ELSE8:%.*]]
 // CHECK:       resolver_return7:
-// CHECK-NEXT:    ret ptr @fmv_inline._Mf32mmMi8mmMsha3
+// CHECK-NEXT:    ret ptr @fmv_inline._Msve2Msve2-aesMsve2-bitperm
 // CHECK:       resolver_else8:
 // CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP21:%.*]] = and i64 [[TMP20]], 19861002584864
-// CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 19861002584864
+// CHECK-NEXT:    [[TMP21:%.*]] = and i64 [[TMP20]], 35433583360
+// CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 35433583360
 // CHECK-NEXT:    [[TMP23:%.*]] = and i1 true, [[TMP22]]
 // CHECK-NEXT:    br i1 [[TMP23]], label [[RESOLVER_RETURN9:%.*]], label [[RESOLVER_ELSE10:%.*]]
 // CHECK:       resolver_return9:
-// CHECK-NEXT:    ret ptr @fmv_inline._MmemtagMsve2-sm4
+// CHECK-NEXT:    ret ptr @fmv_inline._MaesMf64mmMsha2
 // CHECK:       resolver_else10:
 // CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP25:%.*]] = and i64 [[TMP24]], 1444182864640
-// CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 1444182864640
+// CHECK-NEXT:    [[TMP25:%.*]] = and i64 [[TMP24]], 18320798464
+// CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 18320798464
 // CHECK-NEXT:    [[TMP27:%.*]] = and i1 true, [[TMP26]]
 // CHECK-NEXT:    br i1 [[TMP27]], label [[RESOLVER_RETURN11:%.*]], label [[RESOLVER_ELSE12:%.*]]
 // CHECK:       resolver_return11:
-// CHECK-NEXT:    ret ptr @fmv_inline._Msve2-aesMsve2-sha3
+// CHECK-NEXT:    ret ptr @fmv_inline._Mf32mmMi8mmMsha3
 // CHECK:       resolver_else12:
 // CHECK-NEXT:    [[TMP28:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
 // CHECK-NEXT:    [[TMP29:%.*]] = and i64 [[TMP28]], 1208025856
@@ -984,7 +990,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@func
-// CHECK-SAME: () #[[ATTR15]] {
+// CHECK-SAME: () #[[ATTR9]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret void
 //
@@ -1008,21 +1014,21 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv
-// CHECK-NOFMV-SAME: () #[[ATTR1:[0-9]+]] {
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
 // CHECK-NOFMV-NEXT:  entry:
 // CHECK-NOFMV-NEXT:    ret i32 0
 //
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_one
-// CHECK-NOFMV-SAME: () #[[ATTR1]] {
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
 // CHECK-NOFMV-NEXT:  entry:
 // CHECK-NOFMV-NEXT:    ret i32 0
 //
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_two
-// CHECK-NOFMV-SAME: () #[[ATTR1]] {
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
 // CHECK-NOFMV-NEXT:  entry:
 // CHECK-NOFMV-NEXT:    ret i32 0
 //
@@ -1048,21 +1054,21 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_d
-// CHECK-NOFMV-SAME: () #[[ATTR1]] {
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
 // CHECK-NOFMV-NEXT:  entry:
 // CHECK-NOFMV-NEXT:    ret i32 1
 //
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_c
-// CHECK-NOFMV-SAME: () #[[ATTR1]] {
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
 // CHECK-NOFMV-NEXT:  entry:
 // CHECK-NOFMV-NEXT:    ret void
 //
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_default
-// CHECK-NOFMV-SAME: () #[[ATTR1]] {
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
 // CHECK-NOFMV-NEXT:  entry:
 // CHECK-NOFMV-NEXT:    ret i32 111
 //
@@ -1115,7 +1121,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: define {{[^@]+}}@main
-// CHECK-NOFMV-SAME: () #[[ATTR1]] {
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
 // CHECK-NOFMV-NEXT:  entry:
 // CHECK-NOFMV-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
 // CHECK-NOFMV-NEXT:    store i32 0, ptr [[RETVAL]], align 4
@@ -1126,7 +1132,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 //
 // CHECK-NOFMV: Function Attrs: noinline nounwind optnone
 // CHECK-NOFMV-LABEL: define {{[^@]+}}@unused_with_default_def
-// CHECK-NOFMV-SAME: () #[[ATTR1]] {
+// CHECK-NOFMV-SAME: () #[[ATTR0]] {
 // CHECK-NOFMV-NEXT:  entry:
 // CHECK-NOFMV-NEXT:    ret i32 1
 //
diff --git a/clang/test/CodeGen/nvptx_attributes.c b/clang/test/CodeGen/nvptx_attributes.c
index 7dbd9f1321e2..8b9f3a2c18a1 100644
--- a/clang/test/CodeGen/nvptx_attributes.c
+++ b/clang/test/CodeGen/nvptx_attributes.c
@@ -10,8 +10,14 @@
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[RET_ADDR]], align 8
 // CHECK-NEXT:    store i32 1, ptr [[TMP0]], align 4
 // CHECK-NEXT:    ret void
+//
 __attribute__((nvptx_kernel)) void foo(int *ret) {
   *ret = 1;
 }
 
-// CHECK: !0 = !{ptr @foo, !"kernel", i32 1}
+//.
+// CHECK: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx32,+sm_61" }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
diff --git a/clang/test/CodeGen/sanitize-type-globals.cpp b/clang/test/CodeGen/sanitize-type-globals.cpp
index 7cb8de8b238c..1154ab4ca5df 100644
--- a/clang/test/CodeGen/sanitize-type-globals.cpp
+++ b/clang/test/CodeGen/sanitize-type-globals.cpp
@@ -3,7 +3,10 @@
 
 //.
 // CHECK: @x = global %struct.CompleteS zeroinitializer, align 8
+// CHECK: @xExtern = external global %struct.CompleteS, align 8
 // CHECK: @y = external global %struct.S, align 1
+// CHECK: @d = global %class.b zeroinitializer, align 1
+// CHECK: @_ZN1b1eE = external global %class.a, align 1
 // CHECK: @__tysan_shadow_memory_address = external global i64
 // CHECK: @__tysan_app_memory_mask = external global i64
 // CHECK: @__tysan_v1_Simple_20C_2b_2b_20TBAA = linkonce_odr constant { i64, i64, [16 x i8] } { i64 2, i64 0, [16 x i8] c"Simple C++ TBAA\00" }, comdat
@@ -12,8 +15,9 @@
 // CHECK: @__tysan_v1_any_20pointer = linkonce_odr constant { i64, i64, ptr, i64, [12 x i8] } { i64 2, i64 1, ptr @__tysan_v1_omnipotent_20char, i64 0, [12 x i8] c"any pointer\00" }, comdat
 // CHECK: @__tysan_v1_p1_20int = linkonce_odr constant { i64, i64, ptr, i64, [7 x i8] } { i64 2, i64 1, ptr @__tysan_v1_any_20pointer, i64 0, [7 x i8] c"p1 int\00" }, comdat
 // CHECK: @__tysan_v1___ZTS9CompleteS = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, [15 x i8] } { i64 2, i64 2, ptr @__tysan_v1_int, i64 0, ptr @__tysan_v1_p1_20int, i64 8, [15 x i8] c"_ZTS9CompleteS\00" }, comdat
-// CHECK: @llvm.used = appending global [7 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_any_20pointer, ptr @__tysan_v1_p1_20int, ptr @__tysan_v1___ZTS9CompleteS], section "llvm.metadata"
-// CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }]
+// CHECK: @__tysan_v1___ZTS1b = linkonce_odr constant { i64, i64, [7 x i8] } { i64 2, i64 0, [7 x i8] c"_ZTS1b\00" }, comdat
+// CHECK: @llvm.used = appending global [8 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_any_20pointer, ptr @__tysan_v1_p1_20int, ptr @__tysan_v1___ZTS9CompleteS, ptr @__tysan_v1___ZTS1b], section "llvm.metadata"
+// CHECK: @llvm.global_ctors = appending global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @_GLOBAL__sub_I_sanitize_type_globals.cpp, ptr null }, { i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }]
 //.
 struct CompleteS {
   int x;
@@ -22,13 +26,18 @@ struct CompleteS {
 
 void f(CompleteS *);
 CompleteS x;
+extern CompleteS xExtern;
 // CHECK-LABEL: define dso_local void @_Z1gv(
 // CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 // CHECK:  [[ENTRY:.*:]]
 // CHECK:    call void @_Z1fP9CompleteS(ptr noundef @x)
+// CHECK:    call void @_Z1fP9CompleteS(ptr noundef @xExtern)
 // CHECK:    ret void
 //
-void g() { f(&x); }
+void g() {
+  f(&x);
+  f(&xExtern);
+}
 
 typedef struct S IncompleteS;
 void f(IncompleteS *);
@@ -40,11 +49,21 @@ extern IncompleteS y;
 // CHECK:    ret void
 //
 void h() { f(&y); }
+
+class a;
+class b {
+public:
+  using c = a;
+  static c e;
+  b(int, c & = e);
+} d = 0;
+
 //.
 // CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone sanitize_type "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
 // CHECK: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
-// CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
-// CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind }
+// CHECK: attributes #[[ATTR2:[0-9]+]] = { noinline nounwind sanitize_type "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// CHECK: attributes #[[ATTR4:[0-9]+]] = { nounwind }
 //.
 // CHECK: [[META0:![0-9]+]] = !{ptr @x, [[META1:![0-9]+]]}
 // CHECK: [[META1]] = !{!"_ZTS9CompleteS", [[META2:![0-9]+]], i64 0, [[META5:![0-9]+]], i64 8}
@@ -53,6 +72,8 @@ void h() { f(&y); }
 // CHECK: [[META4]] = !{!"Simple C++ TBAA"}
 // CHECK: [[META5]] = !{!"p1 int", [[META6:![0-9]+]], i64 0}
 // CHECK: [[META6]] = !{!"any pointer", [[META3]], i64 0}
-// CHECK: [[META7:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
-// CHECK: [[META8:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+// CHECK: [[META7:![0-9]+]] = !{ptr @d, [[META8:![0-9]+]]}
+// CHECK: [[META8]] = !{!"_ZTS1b"}
+// CHECK: [[META9:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: [[META10:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
 //.
diff --git a/clang/test/CodeGen/tbaa-pointers.c b/clang/test/CodeGen/tbaa-pointers.c
index 6aee2ff3717a..4aae2552f107 100644
--- a/clang/test/CodeGen/tbaa-pointers.c
+++ b/clang/test/CodeGen/tbaa-pointers.c
@@ -193,11 +193,10 @@ typedef struct {
 void unamed_struct_typedef(TypedefS *ptr) {
 // COMMON-LABEL: define void @unamed_struct_typedef(
 // COMMON-SAME: ptr noundef [[PTRA:%.+]])
-// COMMON:   [[PTR_ADDR:%.+]]  = alloca ptr, align 8
+// COMMON:        [[PTR_ADDR:%.+]]  = alloca ptr, align 8
 // DISABLE-NEXT:  store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
-// DISABLE-NEXT:  [[L0:%.+]] = load ptr, ptr  [[PTR_ADDR]], align 8, !tbaa  [[ANYPTR]]
-// DEFAULT-NEXT:  store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[P1TYPEDEF:!.+]]
-// DEFAULT-NEXT:  [[L0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa  [[P1TYPEDEF]]
+// DEFAULT-NEXT:  store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR:!.+]]
+// COMMON-NEXT:   [[L0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa  [[ANYPTR]]
 // COMMON-NEXT:   [[GEP:%.+]]  = getelementptr inbounds nuw %struct.TypedefS, ptr [[L0]], i32 0, i32 0
 // COMMON-NEXT:   store i32 0, ptr [[GEP]], align 4
 // COMMON-NEXT:   ret void
@@ -205,6 +204,24 @@ void unamed_struct_typedef(TypedefS *ptr) {
   ptr->i1 = 0;
 }
 
+int void_ptrs(void **ptr) {
+// COMMON-LABEL: define i32 @void_ptrs(
+// COMMON-SAME: ptr noundef [[PTRA:%.+]])
+// COMMON:        [[PTR_ADDR:%.+]]  = alloca ptr, align 8
+// DISABLE-NEXT:  store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]]
+// DISABLE-NEXT:  [[L0:%.+]] = load ptr, ptr  [[PTR_ADDR]], align 8, !tbaa  [[ANYPTR]]
+// DISABLE-NEXT:  [[L1:%.+]] = load ptr, ptr [[L0]], align 8, !tbaa  [[ANYPTR]]
+// DEFAULT-NEXT:  store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[P2VOID:!.+]]
+// DEFAULT-NEXT:  [[L0:%.+]] = load ptr, ptr  [[PTR_ADDR]], align 8, !tbaa  [[P2VOID]]
+// DEFAULT-NEXT:  [[L1:%.+]] = load ptr, ptr [[L0]], align 8, !tbaa  [[P1VOID:!.+]]
+// COMMON-NEXT:   [[BOOL:%.+]] = icmp ne ptr [[L1]], null
+// COMMON-NEXT:   [[BOOL_EXT:%.+]] = zext i1 [[BOOL]] to i64
+// COMMON-NEXT:   [[COND:%.+]] = select i1 [[BOOL]], i32 0, i32 1
+// COMMON-NEXT:   ret i32 [[COND]]
+
+  return *ptr ? 0 : 1;
+}
+
 // DEFAULT: [[P2INT_0]] = !{[[P2INT:!.+]], [[P2INT]], i64 0}
 // DEFAULT: [[P2INT]] = !{!"p2 int", [[ANY_POINTER:!.+]], i64 0}
 // DISABLE: [[ANYPTR]] = !{[[ANY_POINTER:!.+]], [[ANY_POINTER]], i64 0}
@@ -236,4 +253,8 @@ void unamed_struct_typedef(TypedefS *ptr) {
 // DISABLE: [[S2_TY]]  = !{!"S2", [[ANY_POINTER]], i64 0}
 // COMMON:  [[INT_TAG]] = !{[[INT_TY:!.+]], [[INT_TY]], i64 0}
 // COMMON:  [[INT_TY]] = !{!"int", [[CHAR]], i64 0}
-// DEFAULT: [[P1TYPEDEF]] = !{[[ANY_POINTER]],  [[ANY_POINTER]], i64 0}
+// DEFAULT: [[ANYPTR]] = !{[[ANY_POINTER]],  [[ANY_POINTER]], i64 0}
+// DEFAULT: [[P2VOID]] = !{[[P2VOID_TY:!.+]], [[P2VOID_TY]], i64 0}
+// DEFAULT: [[P2VOID_TY]] = !{!"p2 void", [[ANY_POINTER]], i64 0}
+// DEFAULT: [[P1VOID]] = !{[[P1VOID_TY:!.+]], [[P1VOID_TY]], i64 0}
+// DEFAULT: [[P1VOID_TY]] = !{!"p1 void", [[ANY_POINTER]], i64 0}
diff --git a/clang/test/CodeGen/xcore-abi.c b/clang/test/CodeGen/xcore-abi.c
index bb8d2fec46bd..40e2f418f730 100644
--- a/clang/test/CodeGen/xcore-abi.c
+++ b/clang/test/CodeGen/xcore-abi.c
@@ -76,7 +76,8 @@ void testva (int n, ...) {
   // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[V5]], ptr align 4 [[P]], i32 20, i1 false)
   // CHECK: call void @f(ptr noundef [[V5]])
 
-  int* v6 = va_arg (ap, int[4]);  // an unusual aggregate type
+  // an unusual aggregate type
+  int* v6 = va_arg (ap, int[4]);  // expected-warning{{second argument to 'va_arg' is of array type 'int[4]'}}
   f(v6);
   // CHECK: [[I:%[a-z0-9]+]] = load ptr, ptr [[AP]]
   // CHECK: [[P:%[a-z0-9]+]] = load ptr, ptr [[I]]
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
index fab87aac1831..19730e392551 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -33,7 +33,7 @@
 // CHECK-NEXT:    ret void
 //
 // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel1Pi(
-// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] {
+// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] !max_work_group_size [[META5:![0-9]+]] {
 // CHECK-SPIRV-NEXT:  [[ENTRY:.*:]]
 // CHECK-SPIRV-NEXT:    [[X:%.*]] = alloca ptr addrspace(4), align 8
 // CHECK-SPIRV-NEXT:    [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
@@ -58,7 +58,7 @@
 // OPT-NEXT:    ret void
 //
 // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel1Pi(
-// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0:[0-9]+]] {
+// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0:[0-9]+]] !max_work_group_size [[META5:![0-9]+]] {
 // OPT-SPIRV-NEXT:  [[ENTRY:.*:]]
 // OPT-SPIRV-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64
 // OPT-SPIRV-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
@@ -102,7 +102,7 @@ __global__ void kernel1(int *x) {
 // CHECK-NEXT:    ret void
 //
 // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel2Ri(
-// CHECK-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
+// CHECK-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // CHECK-SPIRV-NEXT:  [[ENTRY:.*:]]
 // CHECK-SPIRV-NEXT:    [[X:%.*]] = alloca ptr addrspace(4), align 8
 // CHECK-SPIRV-NEXT:    [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
@@ -126,7 +126,7 @@ __global__ void kernel1(int *x) {
 // OPT-NEXT:    ret void
 //
 // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel2Ri(
-// OPT-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
+// OPT-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // OPT-SPIRV-NEXT:  [[ENTRY:.*:]]
 // OPT-SPIRV-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64
 // OPT-SPIRV-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
@@ -171,7 +171,7 @@ __global__ void kernel2(int &x) {
 // CHECK-NEXT:    ret void
 //
 // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel3PU3AS2iPU3AS1i(
-// CHECK-SPIRV-SAME: ptr addrspace(2) noundef [[X:%.*]], ptr addrspace(1) noundef [[Y:%.*]]) addrspace(4) #[[ATTR0]] {
+// CHECK-SPIRV-SAME: ptr addrspace(2) noundef [[X:%.*]], ptr addrspace(1) noundef [[Y:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // CHECK-SPIRV-NEXT:  [[ENTRY:.*:]]
 // CHECK-SPIRV-NEXT:    [[X_ADDR:%.*]] = alloca ptr addrspace(2), align 8
 // CHECK-SPIRV-NEXT:    [[Y_ADDR:%.*]] = alloca ptr addrspace(1), align 8
@@ -195,7 +195,7 @@ __global__ void kernel2(int &x) {
 // OPT-NEXT:    ret void
 //
 // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel3PU3AS2iPU3AS1i(
-// OPT-SPIRV-SAME: ptr addrspace(2) nocapture noundef readonly [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR1:[0-9]+]] {
+// OPT-SPIRV-SAME: ptr addrspace(2) nocapture noundef readonly [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR1:[0-9]+]] !max_work_group_size [[META5]] {
 // OPT-SPIRV-NEXT:  [[ENTRY:.*:]]
 // OPT-SPIRV-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(2) [[X]], align 4
 // OPT-SPIRV-NEXT:    store i32 [[TMP0]], ptr addrspace(1) [[Y]], align 4
@@ -302,7 +302,7 @@ struct S {
 // CHECK-NEXT:    ret void
 //
 // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel41S(
-// CHECK-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
+// CHECK-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // CHECK-SPIRV-NEXT:  [[ENTRY:.*:]]
 // CHECK-SPIRV-NEXT:    [[S:%.*]] = alloca [[STRUCT_S]], align 8
 // CHECK-SPIRV-NEXT:    [[S1:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(4)
@@ -343,7 +343,7 @@ struct S {
 // OPT-NEXT:    ret void
 //
 // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel41S(
-// OPT-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
+// OPT-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // OPT-SPIRV-NEXT:  [[ENTRY:.*:]]
 // OPT-SPIRV-NEXT:    [[TMP0:%.*]] = extractvalue [[STRUCT_S]] [[S_COERCE]], 0
 // OPT-SPIRV-NEXT:    [[TMP1:%.*]] = extractvalue [[STRUCT_S]] [[S_COERCE]], 1
@@ -406,7 +406,7 @@ __global__ void kernel4(struct S s) {
 // CHECK-NEXT:    ret void
 //
 // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S(
-// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
+// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // CHECK-SPIRV-NEXT:  [[ENTRY:.*:]]
 // CHECK-SPIRV-NEXT:    [[S:%.*]] = alloca ptr addrspace(4), align 8
 // CHECK-SPIRV-NEXT:    [[S_ADDR:%.*]] = alloca ptr addrspace(4), align 8
@@ -432,7 +432,7 @@ __global__ void kernel4(struct S s) {
 // CHECK-SPIRV-NEXT:    ret void
 //
 // OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel5P1S(
-// OPT-SAME: ptr addrspace(1) nocapture noundef readonly [[S_COERCE:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
+// OPT-SAME: ptr addrspace(1) nocapture noundef readonly [[S_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // OPT-NEXT:  [[ENTRY:.*:]]
 // OPT-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(1) [[S_COERCE]], align 8
 // OPT-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
@@ -446,7 +446,7 @@ __global__ void kernel4(struct S s) {
 // OPT-NEXT:    ret void
 //
 // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S(
-// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
+// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // OPT-SPIRV-NEXT:  [[ENTRY:.*:]]
 // OPT-SPIRV-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[S_COERCE]] to i64
 // OPT-SPIRV-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
@@ -511,7 +511,7 @@ struct T {
 // CHECK-NEXT:    ret void
 //
 // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel61T(
-// CHECK-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
+// CHECK-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // CHECK-SPIRV-NEXT:  [[ENTRY:.*:]]
 // CHECK-SPIRV-NEXT:    [[T:%.*]] = alloca [[STRUCT_T]], align 8
 // CHECK-SPIRV-NEXT:    [[T1:%.*]] = addrspacecast ptr [[T]] to ptr addrspace(4)
@@ -551,7 +551,7 @@ struct T {
 // OPT-NEXT:    ret void
 //
 // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel61T(
-// OPT-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
+// OPT-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // OPT-SPIRV-NEXT:  [[ENTRY:.*:]]
 // OPT-SPIRV-NEXT:    [[TMP0:%.*]] = extractvalue [[STRUCT_T]] [[T_COERCE]], 0
 // OPT-SPIRV-NEXT:    [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x ptr addrspace(4)] [[TMP0]], 0
@@ -606,7 +606,7 @@ __global__ void kernel6(struct T t) {
 // CHECK-NEXT:    ret void
 //
 // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel7Pi(
-// CHECK-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
+// CHECK-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // CHECK-SPIRV-NEXT:  [[ENTRY:.*:]]
 // CHECK-SPIRV-NEXT:    [[X:%.*]] = alloca ptr addrspace(4), align 8
 // CHECK-SPIRV-NEXT:    [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
@@ -631,7 +631,7 @@ __global__ void kernel6(struct T t) {
 // OPT-NEXT:    ret void
 //
 // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel7Pi(
-// OPT-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
+// OPT-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // OPT-SPIRV-NEXT:  [[ENTRY:.*:]]
 // OPT-SPIRV-NEXT:    [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64
 // OPT-SPIRV-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4)
@@ -677,7 +677,7 @@ struct SS {
 // CHECK-NEXT:    ret void
 //
 // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel82SS(
-// CHECK-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
+// CHECK-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // CHECK-SPIRV-NEXT:  [[ENTRY:.*:]]
 // CHECK-SPIRV-NEXT:    [[A:%.*]] = alloca [[STRUCT_SS]], align 8
 // CHECK-SPIRV-NEXT:    [[A1:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4)
@@ -700,7 +700,7 @@ struct SS {
 // OPT-NEXT:    ret void
 //
 // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel82SS(
-// OPT-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
+// OPT-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
 // OPT-SPIRV-NEXT:  [[ENTRY:.*:]]
 // OPT-SPIRV-NEXT:    [[TMP0:%.*]] = extractvalue [[STRUCT_SS]] [[A_COERCE]], 0
 // OPT-SPIRV-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(4) [[TMP0]], align 4
@@ -727,5 +727,9 @@ __global__ void kernel8(struct SS a) {
   *a.x += 3.f;
 }
 //.
+// CHECK-SPIRV: [[META5]] = !{i32 1024, i32 1, i32 1}
+//.
 // OPT: [[META4]] = !{}
 //.
+// OPT-SPIRV: [[META5]] = !{i32 1024, i32 1, i32 1}
+//.
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
index 11a133fd1351..253ac0898f54 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -4,6 +4,9 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa --gpu-max-threads-per-block=1024 \
 // RUN:     -fcuda-is-device -emit-llvm -o - -x hip %s \
 // RUN:     | FileCheck -check-prefixes=CHECK,MAX1024 %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa --gpu-max-threads-per-block=1024 \
+// RUN:     -fcuda-is-device -emit-llvm -o - -x hip %s \
+// RUN:     | FileCheck -check-prefixes=CHECK-SPIRV,MAX1024-SPIRV %s
 // RUN: %clang_cc1 -triple nvptx \
 // RUN:     -fcuda-is-device -emit-llvm -o - %s | FileCheck %s \
 // RUN:     -check-prefix=NAMD
@@ -21,12 +24,14 @@
 
 __global__ void flat_work_group_size_default() {
 // CHECK: define{{.*}} amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]]
+// CHECK-SPIRV: define{{.*}} spir_kernel void @_Z28flat_work_group_size_defaultv(){{.*}} !max_work_group_size [[MAX_WORK_GROUP_SIZE_DEFAULT:![0-9]+]]
 // NOUB: define{{.*}} void @_Z28flat_work_group_size_defaultv() [[NOUB:#[0-9]+]]
 }
 
 __attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics
 __global__ void flat_work_group_size_32_64() {
 // CHECK: define{{.*}} amdgpu_kernel void @_Z26flat_work_group_size_32_64v() [[FLAT_WORK_GROUP_SIZE_32_64:#[0-9]+]]
+// CHECK-SPIRV: define{{.*}} spir_kernel void @_Z26flat_work_group_size_32_64v(){{.*}} !max_work_group_size [[MAX_WORK_GROUP_SIZE_64:![0-9]+]]
 }
 __attribute__((amdgpu_waves_per_eu(2))) // expected-no-diagnostics
 __global__ void waves_per_eu_2() {
@@ -82,7 +87,9 @@ template __global__ void template_32_4_a_max_num_work_groups<2>();
 
 // DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
 // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
+// MAX1024-SPIRV-DAG: [[MAX_WORK_GROUP_SIZE_DEFAULT]] = !{i32 1024, i32 1, i32 1}
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64"
+// CHECK-SPIRV-DAG: [[MAX_WORK_GROUP_SIZE_64]] = !{i32 64, i32 1, i32 1}
 // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
 // CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32"
 // CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"
diff --git a/clang/test/CodeGenCUDA/device-fun-linkage.cu b/clang/test/CodeGenCUDA/device-fun-linkage.cu
index 54899e0e9c0f..bdac62d1d03e 100644
--- a/clang/test/CodeGenCUDA/device-fun-linkage.cu
+++ b/clang/test/CodeGenCUDA/device-fun-linkage.cu
@@ -17,8 +17,8 @@ template __device__ void func<int>();
 // RDC:       define weak_odr void @_Z4funcIiEvv()
 
 template __global__ void kernel<int>();
-// NORDC:     define void @_Z6kernelIiEvv()
-// RDC:       define weak_odr void @_Z6kernelIiEvv()
+// NORDC:     define ptx_kernel void @_Z6kernelIiEvv()
+// RDC:       define weak_odr ptx_kernel void @_Z6kernelIiEvv()
 
 // Ensure that unused static device function is eliminated
 static __device__ void static_func() {}
@@ -28,5 +28,5 @@ static __device__ void static_func() {}
 // Ensure that kernel function has external or weak_odr
 // linkage regardless static specifier
 static __global__ void static_kernel() {}
-// NORDC:     define void @_ZL13static_kernelv()
-// RDC:       define weak_odr void @_ZL13static_kernelv[[FILEID:.*]]()
+// NORDC:     define ptx_kernel void @_ZL13static_kernelv()
+// RDC:       define weak_odr ptx_kernel void @_ZL13static_kernelv[[FILEID:.*]]()
diff --git a/clang/test/CodeGenCUDA/grid-constant.cu b/clang/test/CodeGenCUDA/grid-constant.cu
index 8d4be9c9dc7e..e7000cab3cda 100644
--- a/clang/test/CodeGenCUDA/grid-constant.cu
+++ b/clang/test/CodeGenCUDA/grid-constant.cu
@@ -21,11 +21,11 @@ void foo() {
 }
 //.
 //.
-// CHECK: [[META0:![0-9]+]] = !{ptr @_Z6kernel1Sii, !"kernel", i32 1, !"grid_constant", [[META1:![0-9]+]]}
+// CHECK: [[META0:![0-9]+]] = !{ptr @_Z6kernel1Sii, !"grid_constant", [[META1:![0-9]+]]}
 // CHECK: [[META1]] = !{i32 1, i32 3}
-// CHECK: [[META2:![0-9]+]] = !{ptr @_Z13tkernel_constIK1SEvT_, !"kernel", i32 1, !"grid_constant", [[META3:![0-9]+]]}
+// CHECK: [[META2:![0-9]+]] = !{ptr @_Z13tkernel_constIK1SEvT_, !"grid_constant", [[META3:![0-9]+]]}
 // CHECK: [[META3]] = !{i32 1}
-// CHECK: [[META4:![0-9]+]] = !{ptr @_Z13tkernel_constI1SEvT_, !"kernel", i32 1, !"grid_constant", [[META3]]}
-// CHECK: [[META5:![0-9]+]] = !{ptr @_Z7tkernelIK1SEviT_, !"kernel", i32 1, !"grid_constant", [[META6:![0-9]+]]}
+// CHECK: [[META4:![0-9]+]] = !{ptr @_Z13tkernel_constI1SEvT_, !"grid_constant", [[META3]]}
+// CHECK: [[META5:![0-9]+]] = !{ptr @_Z7tkernelIK1SEviT_, !"grid_constant", [[META6:![0-9]+]]}
 // CHECK: [[META6]] = !{i32 2}
 //.
diff --git a/clang/test/CodeGenCUDA/offload_via_llvm.cu b/clang/test/CodeGenCUDA/offload_via_llvm.cu
index 434eba99c179..62942d8dc075 100644
--- a/clang/test/CodeGenCUDA/offload_via_llvm.cu
+++ b/clang/test/CodeGenCUDA/offload_via_llvm.cu
@@ -7,7 +7,7 @@
 #define __OFFLOAD_VIA_LLVM__ 1
 #include "Inputs/cuda.h"
 
-// HST-LABEL: define dso_local void @_Z18__device_stub__fooisPvS_(
+// HST-LABEL: define dso_local ptx_kernel void @_Z18__device_stub__fooisPvS_(
 // HST-SAME: i32 noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
 // HST-NEXT:  [[ENTRY:.*:]]
 // HST-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
@@ -50,7 +50,7 @@
 // HST:       [[SETUP_END]]:
 // HST-NEXT:    ret void
 //
-// DEV-LABEL: define dso_local void @_Z3fooisPvS_(
+// DEV-LABEL: define dso_local ptx_kernel void @_Z3fooisPvS_(
 // DEV-SAME: i32 noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
 // DEV-NEXT:  [[ENTRY:.*:]]
 // DEV-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
diff --git a/clang/test/CodeGenCUDA/ptx-kernels.cu b/clang/test/CodeGenCUDA/ptx-kernels.cu
index b7172b773692..a7d5e11bd496 100644
--- a/clang/test/CodeGenCUDA/ptx-kernels.cu
+++ b/clang/test/CodeGenCUDA/ptx-kernels.cu
@@ -10,7 +10,7 @@
 extern "C"
 __device__ void device_function() {}
 
-// CHECK-LABEL: define{{.*}} void @global_function
+// CHECK-LABEL: define{{.*}} ptx_kernel void @global_function
 extern "C"
 __global__ void global_function() {
   // CHECK: call void @device_function
@@ -19,7 +19,7 @@ __global__ void global_function() {
 
 // Make sure host-instantiated kernels are preserved on device side.
 template <typename T> __global__ void templated_kernel(T param) {}
-// CHECK-DAG: define{{.*}} void @_Z16templated_kernelIiEvT_(
+// CHECK-DAG: define{{.*}} ptx_kernel void @_Z16templated_kernelIiEvT_(
 
 namespace {
 __global__ void anonymous_ns_kernel() {}
@@ -30,6 +30,3 @@ void host_function() {
   templated_kernel<<<0, 0>>>(0);
   anonymous_ns_kernel<<<0,0>>>();
 }
-
-// CHECK: !{{[0-9]+}} = !{ptr @global_function, !"kernel", i32 1}
-// CHECK: !{{[0-9]+}} = !{ptr @_Z16templated_kernelIiEvT_, !"kernel", i32 1}
diff --git a/clang/test/CodeGenCUDA/usual-deallocators.cu b/clang/test/CodeGenCUDA/usual-deallocators.cu
index b85a706813fc..64560efb7413 100644
--- a/clang/test/CodeGenCUDA/usual-deallocators.cu
+++ b/clang/test/CodeGenCUDA/usual-deallocators.cu
@@ -109,7 +109,7 @@ __host__ __device__ void tests_hd(void *t) {
 }
 
 // Make sure that we've generated the kernel used by A::~A.
-// DEVICE-LABEL: define void @_Z1fIiEvT_
+// DEVICE-LABEL: define ptx_kernel void @_Z1fIiEvT_
 
 // Make sure we've picked deallocator for the correct side of compilation.
 
@@ -147,5 +147,3 @@ __host__ __device__ void tests_hd(void *t) {
 // COMMON-LABEL: define  linkonce_odr void @_ZN8H1H2D1D2dlEPv(ptr noundef %0)
 // DEVICE: call void @dev_fn()
 // HOST: call void @host_fn()
-
-// DEVICE: !0 = !{ptr @_Z1fIiEvT_, !"kernel", i32 1}
diff --git a/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl b/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl
deleted file mode 100644
index aa13b2758185..000000000000
--- a/clang/test/CodeGenHLSL/HLSLControlFlowHint.hlsl
+++ /dev/null
@@ -1,48 +0,0 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple spirv-vulkan-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s
-
-// CHECK: define {{.*}} i32 {{.*}}test_branch{{.*}}(i32 {{.*}} [[VALD:%.*]])
-// CHECK: [[PARAM:%.*]] = load i32, ptr [[VALD]].addr, align 4
-// CHECK: [[CMP:%.*]] = icmp sgt i32 [[PARAM]], 0
-// CHECK: br i1 [[CMP]], label %if.then, label %if.else, !hlsl.controlflow.hint [[HINT_BRANCH:![0-9]+]]
-export int test_branch(int X){
-    int resp;
-    [branch] if (X > 0) {
-        resp = -X;
-    } else {
-        resp = X * 2;
-    }
-
-    return resp;
-}
-
-// CHECK: define {{.*}} i32 {{.*}}test_flatten{{.*}}(i32 {{.*}} [[VALD:%.*]])
-// CHECK: [[PARAM:%.*]] = load i32, ptr [[VALD]].addr, align 4
-// CHECK: [[CMP:%.*]] = icmp sgt i32 [[PARAM]], 0
-// CHECK: br i1 [[CMP]], label %if.then, label %if.else, !hlsl.controlflow.hint [[HINT_FLATTEN:![0-9]+]]
-export int test_flatten(int X){
-    int resp;
-    [flatten] if (X > 0) {
-        resp = -X;
-    } else {
-        resp = X * 2;
-    }
-
-    return resp;
-}
-
-// CHECK: define {{.*}} i32 {{.*}}test_no_attr{{.*}}(i32 {{.*}} [[VALD:%.*]])
-// CHECK-NOT: !hlsl.controlflow.hint
-export int test_no_attr(int X){
-    int resp;
-    if (X > 0) {
-        resp = -X;
-    } else {
-        resp = X * 2;
-    }
-
-    return resp;
-}
-
-//CHECK: [[HINT_BRANCH]] = !{!"hlsl.controlflow.hint", i32 1}
-//CHECK: [[HINT_FLATTEN]] = !{!"hlsl.controlflow.hint", i32 2}
diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-subscript.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-subscript.hlsl
index 4428b77dd9ec..2ad5b82a0291 100644
--- a/clang/test/CodeGenHLSL/builtins/RWBuffer-subscript.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-subscript.hlsl
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s --check-prefixes=DXC,CHECK
+// RUN: %clang_cc1 -triple spirv1.6-pc-vulkan1.3-compute -emit-llvm -o - -O0 %s | FileCheck %s --check-prefixes=SPIRV,CHECK
 
 RWBuffer<int> In;
 RWBuffer<int> Out;
@@ -7,15 +8,19 @@ RWBuffer<int> Out;
 void main(unsigned GI : SV_GroupIndex) {
   // CHECK: define void @main()
 
-  // CHECK: %[[INPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // DXC: %[[INPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // SPIRV: %[[INPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_i32_5_2_0_0_2_0t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 0) %{{.*}}, i32 %{{.*}})
   // CHECK: %[[LOAD:.*]] = load i32, ptr %[[INPTR]]
-  // CHECK: %[[OUTPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // DXC: %[[OUTPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // SPIRV: %[[OUTPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_i32_5_2_0_0_2_0t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 0) %{{.*}}, i32 %{{.*}})
   // CHECK: store i32 %[[LOAD]], ptr %[[OUTPTR]]
   Out[GI] = In[GI];
 
-  // CHECK: %[[INPTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // DXC: %[[INPTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // SPIRV: %[[INPTR:.*]] = call ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_i32_5_2_0_0_2_0t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 0) %{{.*}}, i32 %{{.*}})
   // CHECK: %[[LOAD:.*]] = load i32, ptr %[[INPTR]]
-  // CHECK: %[[OUTPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // DXC: %[[OUTPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // SPIRV: %[[OUTPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_i32_5_2_0_0_2_0t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 0) %{{.*}}, i32 %{{.*}})
   // CHECK: store i32 %[[LOAD]], ptr %[[OUTPTR]]
   Out[GI] = In.Load(GI);
 }
diff --git a/clang/test/CodeGenOpenCL/ptx-calls.cl b/clang/test/CodeGenOpenCL/ptx-calls.cl
index 0081152ae40e..ae187173b173 100644
--- a/clang/test/CodeGenOpenCL/ptx-calls.cl
+++ b/clang/test/CodeGenOpenCL/ptx-calls.cl
@@ -7,7 +7,5 @@ void device_function() {
 __kernel void kernel_function() {
   device_function();
 }
-// CHECK-LABEL: define{{.*}} spir_kernel void @kernel_function()
+// CHECK-LABEL: define{{.*}} ptx_kernel void @kernel_function()
 // CHECK: call void @device_function()
-// CHECK: !{{[0-9]+}} = !{ptr @kernel_function, !"kernel", i32 1}
-
diff --git a/clang/test/CodeGenOpenCL/ptx-kernels.cl b/clang/test/CodeGenOpenCL/ptx-kernels.cl
index 210e5682ac72..eac0df4abfbe 100644
--- a/clang/test/CodeGenOpenCL/ptx-kernels.cl
+++ b/clang/test/CodeGenOpenCL/ptx-kernels.cl
@@ -6,6 +6,4 @@ void device_function() {
 
 __kernel void kernel_function() {
 }
-// CHECK-LABEL: define{{.*}} spir_kernel void @kernel_function()
-
-// CHECK: !{{[0-9]+}} = !{ptr @kernel_function, !"kernel", i32 1}
+// CHECK-LABEL: define{{.*}} ptx_kernel void @kernel_function()
diff --git a/clang/test/CodeGenOpenCL/reflect.cl b/clang/test/CodeGenOpenCL/reflect.cl
index 9ae4a5f027d3..f5b618f6a35d 100644
--- a/clang/test/CodeGenOpenCL/reflect.cl
+++ b/clang/test/CodeGenOpenCL/reflect.cl
@@ -12,8 +12,8 @@ bool device_function() {
   return __nvvm_reflect("__CUDA_ARCH") >= 700;
 }
 
-// CHECK-LABEL: define dso_local spir_kernel void @kernel_function(
-// CHECK-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
+// CHECK-LABEL: define dso_local ptx_kernel void @kernel_function(
+// CHECK-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 4
 // CHECK-NEXT:    store ptr addrspace(1) [[I]], ptr [[I_ADDR]], align 4
@@ -26,3 +26,9 @@ bool device_function() {
 __kernel void kernel_function(__global int *i) {
   *i = device_function();
 }
+//.
+// CHECK: [[META3]] = !{i32 1}
+// CHECK: [[META4]] = !{!"none"}
+// CHECK: [[META5]] = !{!"int*"}
+// CHECK: [[META6]] = !{!""}
+//.
diff --git a/clang/test/Driver/Inputs/MacOSX15.1.sdk/embedded/usr/include/.keep b/clang/test/Driver/Inputs/MacOSX15.1.sdk/embedded/usr/include/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/clang/test/Driver/Inputs/MacOSX15.1.sdk/embedded/usr/include/.keep
diff --git a/clang/test/Driver/Inputs/MacOSX15.1.sdk/embedded/usr/local/include/.keep b/clang/test/Driver/Inputs/MacOSX15.1.sdk/embedded/usr/local/include/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/clang/test/Driver/Inputs/MacOSX15.1.sdk/embedded/usr/local/include/.keep
diff --git a/clang/test/Driver/Inputs/MacOSX15.1.sdk/usr/include/c++/v1/.keep b/clang/test/Driver/Inputs/MacOSX15.1.sdk/usr/include/c++/v1/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/clang/test/Driver/Inputs/MacOSX15.1.sdk/usr/include/c++/v1/.keep
diff --git a/clang/test/Driver/Inputs/MacOSX15.1.sdk/usr/local/include/.keep b/clang/test/Driver/Inputs/MacOSX15.1.sdk/usr/local/include/.keep
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/clang/test/Driver/Inputs/MacOSX15.1.sdk/usr/local/include/.keep
diff --git a/clang/test/Driver/Inputs/config-zos/clang.cfg b/clang/test/Driver/Inputs/config-zos/clang.cfg
new file mode 100644
index 000000000000..43a5dbfaa618
--- /dev/null
+++ b/clang/test/Driver/Inputs/config-zos/clang.cfg
@@ -0,0 +1 @@
+-DABC=123
diff --git a/clang/test/Driver/Inputs/config-zos/def.cfg b/clang/test/Driver/Inputs/config-zos/def.cfg
new file mode 100644
index 000000000000..156f9c85fb4f
--- /dev/null
+++ b/clang/test/Driver/Inputs/config-zos/def.cfg
@@ -0,0 +1 @@
+-DDEF=456
diff --git a/clang/test/Driver/Inputs/config-zos/tst/def.cfg b/clang/test/Driver/Inputs/config-zos/tst/def.cfg
new file mode 100644
index 000000000000..156f9c85fb4f
--- /dev/null
+++ b/clang/test/Driver/Inputs/config-zos/tst/def.cfg
@@ -0,0 +1 @@
+-DDEF=456
diff --git a/clang/test/Driver/amdgpu-openmp-toolchain.c b/clang/test/Driver/amdgpu-openmp-toolchain.c
index f596708047c1..1c2ee2617313 100644
--- a/clang/test/Driver/amdgpu-openmp-toolchain.c
+++ b/clang/test/Driver/amdgpu-openmp-toolchain.c
@@ -81,3 +81,7 @@
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp --offload-arch=gfx803 \
 // RUN:     -stdlib=libc++ -nogpulib %s 2>&1 | FileCheck %s --check-prefix=LIBCXX
 // LIBCXX-NOT: include/amdgcn-amd-amdhsa/c++/v1
+
+// RUN: %clang -### -target x86_64-pc-linux-gnu -nogpulib  -fopenmp --offload-arch=gfx90a \
+// RUN:   -ftime-report %s 2>&1 | FileCheck %s --check-prefix=CHECK-TIME-REPORT
+// CHECK-TIME-REPORT: clang-linker-wrapper{{.*}}"--device-compiler=-ftime-report"
diff --git a/clang/test/Driver/config-zos.c b/clang/test/Driver/config-zos.c
new file mode 100644
index 000000000000..8de02ec101b9
--- /dev/null
+++ b/clang/test/Driver/config-zos.c
@@ -0,0 +1,17 @@
+// REQUIRES: shell
+// REQUIRES: systemz-registered-target
+
+// RUN: unset CLANG_NO_DEFAULT_CONFIG
+// RUN: rm -rf %t && mkdir %t
+
+// RUN: mkdir -p %t/testbin
+// RUN: mkdir -p %t/etc
+// RUN: ln -s %clang %t/testbin/clang
+// RUN: echo "-DXYZ=789" >%t/etc/clang.cfg
+// RUN: %t/testbin/clang --target=s390x-ibm-zos -c -### -no-canonical-prefixes %s 2>&1 | FileCheck -DDIR=%t %s 
+// RUN: %t/testbin/clang --target=s390x-ibm-zos -c -### -no-canonical-prefixes --no-default-config %s 2>&1 | FileCheck -check-prefix=NOCONFIG %s 
+//
+// CHECK: Configuration file: [[DIR]]/etc/clang.cfg
+// CHECK: "-D" "XYZ=789"
+// NOCONFIG-NOT: Configuration file: {{.*}}/etc/clang.cfg
+// NOCONFIG-NOT: "-D" "XYZ=789"
diff --git a/clang/test/Driver/config-zos1.c b/clang/test/Driver/config-zos1.c
new file mode 100644
index 000000000000..5b1012d00736
--- /dev/null
+++ b/clang/test/Driver/config-zos1.c
@@ -0,0 +1,23 @@
+// REQUIRES: shell
+// REQUIRES: systemz-registered-target
+
+// RUN: unset CLANG_NO_DEFAULT_CONFIG
+
+// RUN: export CLANG_CONFIG_PATH=%S/Inputs/config-zos
+// RUN: %clang --target=s390x-ibm-zos -c -### %s 2>&1 | FileCheck %s 
+// CHECK: Configuration file: {{.*}}/Inputs/config-zos/clang.cfg
+// CHECK: "-D" "ABC=123"
+
+// RUN: export CLANG_CONFIG_PATH=%S/Inputs/config-zos/def.cfg
+// RUN: %clang --target=s390x-ibm-zos -c -### %s 2>&1 | FileCheck %s -check-prefix=CHECK-DEF
+// CHECK-DEF: Configuration file: {{.*}}/Inputs/config-zos/def.cfg
+// CHECK-DEF: "-D" "DEF=456"
+
+// RUN: export CLANG_CONFIG_PATH=%S/Inputs/config-zos/Garbage
+// RUN: not %clang --target=s390x-ibm-zos -c -### %s 2>&1 | FileCheck %s  -check-prefix=CHECK-ERR
+// CHECK-ERR:  error: configuration file '{{.*}}/Inputs/config-zos/Garbage' cannot be found
+
+// The directory exists but no clang.cfg in it
+// RUN: export CLANG_CONFIG_PATH=%S/Inputs/config-zos/tst
+// RUN: not %clang --target=s390x-ibm-zos -c -### %s 2>&1 | FileCheck %s  -check-prefix=CHECK-ERRDIR
+// CHECK-ERRDIR:  error: configuration file '{{.*}}/Inputs/config-zos/tst/clang.cfg' cannot be found
diff --git a/clang/test/Driver/darwin-embedded-search-paths-libcxx.c b/clang/test/Driver/darwin-embedded-search-paths-libcxx.c
new file mode 100644
index 000000000000..0f9a8467b061
--- /dev/null
+++ b/clang/test/Driver/darwin-embedded-search-paths-libcxx.c
@@ -0,0 +1,45 @@
+// REQUIRES: default-cxx-stdlib=libc++
+// UNSUPPORTED: system-windows
+//   Windows is unsupported because we use the Unix path separator `/` in the test.
+
+// Unlike the Darwin driver, the MachO driver doesn't add any framework search paths,
+// only the normal header ones.
+// RUN: %clang -x c -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk -### -c %s 2>&1 \
+// RUN: | FileCheck --check-prefixes=CC1,NO-CXX,ULI,CI,UI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s
+
+// Unlike the Darwin driver, the MachO driver doesn't default to libc++, but when
+// CLANG_DEFAULT_CXX_STDLIB is libc++ then the MachO driver should find the search path.
+// RUN: %clang -x c++ -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk -### -c %s 2>&1 \
+// RUN: | FileCheck --check-prefixes=CC1,CXX,ULI,CI,UI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s
+
+// If the user requests libc++, the MachO driver should still find the search path.
+// RUN: %clang -x c++ -stdlib=libc++ -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk -### -c %s 2>&1 \
+// RUN: | FileCheck --check-prefixes=CC1,CXX,ULI,CI,UI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s
+
+// Verify that embedded uses can swap in alternate usr/include and usr/local/include directories.
+// usr/local/include is specified in the driver as -internal-isystem, however, the driver generated
+// paths come before the paths in the driver arguments. In order to keep usr/local/include in the
+// same position, -isystem has to be used instead of -Xclang -internal-isystem. There isn't an
+// -externc-isystem, but it's ok to use -Xclang -internal-externc-isystem since the driver doesn't
+// use that if -nostdlibinc or -nostdinc is passed.
+// RUN: %clang -x c++ -stdlib=libc++ -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk \
+// RUN:        -nostdlibinc -isystem %S/Inputs/MacOSX15.1.sdk/embedded/usr/local/include \
+// RUN:        -Xclang -internal-externc-isystem -Xclang %S/Inputs/MacOSX15.1.sdk/embedded/usr/include \
+// RUN:        -### -c %s 2>&1 | FileCheck --check-prefixes=CC1,NO-CXX,EULI,CI,EUI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s
+
+
+// The ordering of these flags doesn't matter, and so this test is a little
+// fragile. i.e. all of the -internal-isystem paths will be searched before the
+// -internal-externc-isystem ones, and their order on the command line doesn't
+// matter. The line order here is just the current order that the driver writes
+// the cc1 arguments.
+
+// CC1: "-cc1"
+// NO-CXX-NOT: "-internal-isystem" "{{.*}}/include/c++/v1"
+// CXX-SAME: "-internal-isystem" "{{.*}}/include/c++/v1"
+// ULI-SAME: "-internal-isystem" "[[SDKROOT]]/usr/local/include"
+// EULI-SAME: "-isystem" "[[SDKROOT]]/embedded/usr/local/include"
+// CI-SAME: "-internal-isystem" "{{.*}}/clang/{{[[:digit:].]*}}/include"
+// UI-SAME: "-internal-externc-isystem" "[[SDKROOT]]/usr/include"
+// EUI-SAME: "-internal-externc-isystem" "[[SDKROOT]]/embedded/usr/include"
+// NO-FW-NOT: "-internal-iframework"
diff --git a/clang/test/Driver/darwin-embedded-search-paths.c b/clang/test/Driver/darwin-embedded-search-paths.c
new file mode 100644
index 000000000000..bd651b7a1cd1
--- /dev/null
+++ b/clang/test/Driver/darwin-embedded-search-paths.c
@@ -0,0 +1,46 @@
+// REQUIRES: !(default-cxx-stdlib=libc++)
+// UNSUPPORTED: system-windows
+//   Windows is unsupported because we use the Unix path separator `/` in the test.
+
+// Unlike the Darwin driver, the MachO driver doesn't add any framework search paths,
+// only the normal header ones.
+// RUN: %clang -x c -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk -### -c %s 2>&1 \
+// RUN: | FileCheck --check-prefixes=CC1,NO-CXX,ULI,CI,UI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s
+
+// Unlike the Darwin driver, the MachO driver doesn't default to libc++, and unless
+// CLANG_DEFAULT_CXX_STDLIB is libc++ it won't add any search paths.
+// RUN: %clang -x c++ -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk -### -c %s 2>&1 \
+// RUN: | FileCheck --check-prefixes=CC1,NO-CXX,ULI,CI,UI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s
+
+// However, if the user requests libc++, the MachO driver should find the search path.
+// RUN: %clang -x c++ -stdlib=libc++ -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk -### -c %s 2>&1 \
+// RUN: | FileCheck --check-prefixes=CC1,CXX,ULI,CI,UI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s
+
+// Verify that embedded uses can swap in alternate usr/include and usr/local/include directories.
+// usr/local/include is specified in the driver as -internal-isystem, however, the driver generated
+// paths come before the paths in the driver arguments. In order to keep usr/local/include in the
+// same position, -isystem has to be used instead of -Xclang -internal-isystem. There isn't an
+// -externc-isystem, but it's ok to use -Xclang -internal-externc-isystem since the driver doesn't
+// use that if -nostdlibinc or -nostdinc is passed.
+// RUN: %clang -x c++ -stdlib=libc++ -target arm64-apple-none-macho -isysroot %S/Inputs/MacOSX15.1.sdk \
+// RUN:        -nostdlibinc -isystem %S/Inputs/MacOSX15.1.sdk/embedded/usr/local/include \
+// RUN:        -Xclang -internal-externc-isystem -Xclang %S/Inputs/MacOSX15.1.sdk/embedded/usr/include \
+// RUN:        -### -c %s 2>&1 | FileCheck --check-prefixes=CC1,NO-CXX,EULI,CI,EUI,NO-FW -DSDKROOT=%S/Inputs/MacOSX15.1.sdk %s
+
+
+// The ordering of these flags doesn't matter, and so this test is a little
+// fragile. i.e. all of the -internal-isystem paths will be searched before the
+// -internal-externc-isystem ones, and their order on the command line doesn't
+// matter. The line order here is just the current order that the driver writes
+// the cc1 arguments.
+
+// CC1: "-cc1"
+// CC1: "-resource-dir" "[[RESOURCE_DIR:[^"]*]]"
+// NO-CXX-NOT: "-internal-isystem" "{{.*}}/include/c++/v1"
+// CXX-SAME: "-internal-isystem" "{{.*}}/include/c++/v1"
+// ULI-SAME: "-internal-isystem" "[[SDKROOT]]/usr/local/include"
+// EULI-SAME: "-isystem" "[[SDKROOT]]/embedded/usr/local/include"
+// CI-SAME: "-internal-isystem" "[[RESOURCE_DIR]]/include"
+// UI-SAME: "-internal-externc-isystem" "[[SDKROOT]]/usr/include"
+// EUI-SAME: "-internal-externc-isystem" "[[SDKROOT]]/embedded/usr/include"
+// NO-FW-NOT: "-internal-iframework"
diff --git a/clang/test/Driver/lto.c b/clang/test/Driver/lto.c
index 5be95013f00d..a85f953af37a 100644
--- a/clang/test/Driver/lto.c
+++ b/clang/test/Driver/lto.c
@@ -114,3 +114,9 @@
 //
 // CHECK-GISEL:         "-plugin-opt=-global-isel=1"
 // CHECK-DISABLE-GISEL: "-plugin-opt=-global-isel=0"
+
+// -flto passes -time-passes when -ftime-report is passed
+// RUN: %clang --target=x86_64-unknown-linux-gnu -### %s -flto -ftime-report 2> %t
+// RUN: FileCheck --check-prefix=CHECK-TIME-REPORT < %t %s
+
+// CHECK-TIME-REPORT: "-plugin-opt=-time-passes"
diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c
index f08ff00c9cbe..a8d9fcd8569c 100644
--- a/clang/test/Driver/print-supported-extensions-riscv.c
+++ b/clang/test/Driver/print-supported-extensions-riscv.c
@@ -193,6 +193,7 @@
 // CHECK-NEXT:     xqcia                0.2       'Xqcia' (Qualcomm uC Arithmetic Extension)
 // CHECK-NEXT:     xqciac               0.2       'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension)
 // CHECK-NEXT:     xqcicli              0.2       'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
+// CHECK-NEXT:     xqcicm               0.2       'Xqcicm' (Qualcomm uC Conditional Move Extension)
 // CHECK-NEXT:     xqcics               0.2       'Xqcics' (Qualcomm uC Conditional Select Extension)
 // CHECK-NEXT:     xqcicsr              0.2       'Xqcicsr' (Qualcomm uC CSR Extension)
 // CHECK-NEXT:     xqcilsm              0.2       'Xqcilsm' (Qualcomm uC Load Store Multiple Extension)
diff --git a/clang/test/Driver/riscv-cpus.c b/clang/test/Driver/riscv-cpus.c
index 1b09945620f8..e97b6940662d 100644
--- a/clang/test/Driver/riscv-cpus.c
+++ b/clang/test/Driver/riscv-cpus.c
@@ -433,6 +433,19 @@
 // MCPU-SIFIVE-P470-SAME: "-target-feature" "+zvkt"
 // MCPU-SIFIVE-P470-SAME: "-target-abi" "lp64d"
 
+// RUN: %clang -target riscv64 -### -c %s 2>&1 -mcpu=sifive-p550 | FileCheck -check-prefix=MCPU-SIFIVE-P550 %s
+// MCPU-SIFIVE-P550: "-nostdsysteminc" "-target-cpu" "sifive-p550"
+// MCPU-SIFIVE-P550-SAME: "-target-feature" "+m"
+// MCPU-SIFIVE-P550-SAME: "-target-feature" "+a"
+// MCPU-SIFIVE-P550-SAME: "-target-feature" "+f"
+// MCPU-SIFIVE-P550-SAME: "-target-feature" "+d"
+// MCPU-SIFIVE-P550-SAME: "-target-feature" "+c"
+// MCPU-SIFIVE-P550-SAME: "-target-feature" "+zicsr"
+// MCPU-SIFIVE-P550-SAME: "-target-feature" "+zifencei"
+// MCPU-SIFIVE-P550-SAME: "-target-feature" "+zba"
+// MCPU-SIFIVE-P550-SAME: "-target-feature" "+zbb"
+// MCPU-SIFIVE-P550-SAME: "-target-abi" "lp64d"
+
 // RUN: %clang -target riscv64 -### -c %s 2>&1 -mcpu=sifive-p670 | FileCheck -check-prefix=MCPU-SIFIVE-P670 %s
 // MCPU-SIFIVE-P670: "-target-cpu" "sifive-p670"
 // MCPU-SIFIVE-P670-SAME: "-target-feature" "+m"
diff --git a/clang/test/Driver/spirv-openmp-toolchain.c b/clang/test/Driver/spirv-openmp-toolchain.c
index 377b2d9be0b0..3a94d978c2d7 100644
--- a/clang/test/Driver/spirv-openmp-toolchain.c
+++ b/clang/test/Driver/spirv-openmp-toolchain.c
@@ -1,4 +1,4 @@
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=spirv64-intel \
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=spirv64-intel \
 // RUN:        --libomptarget-spirv-bc-path=%t/ -nogpulib %s 2>&1 \
 // RUN: | FileCheck %s
 
diff --git a/clang/test/Driver/uefi-constructed-args.c b/clang/test/Driver/uefi-constructed-args.c
index e90857bb6fb5..3cc5abe69745 100644
--- a/clang/test/Driver/uefi-constructed-args.c
+++ b/clang/test/Driver/uefi-constructed-args.c
@@ -4,6 +4,7 @@
 // CHECK-SAME: "-triple" "x86_64-unknown-uefi"
 // CHECK-SAME: "-mrelocation-model" "pic" "-pic-level" "2"
 // CHECK-SAME: "-mframe-pointer=all"
+// CHECK-SAME: "-fms-extensions"
 // CHECK-NEXT: "-nologo"
 // CHECK-SAME: "-subsystem:efi_application"
 // CHECK-SAME: "-entry:EfiMain"
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 2e45f73692f5..281339716c3e 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -44,7 +44,7 @@
 // AMDGPU-NEXT:    call void @__gpu_exit() #[[ATTR8:[0-9]+]]
 // AMDGPU-NEXT:    unreachable
 //
-// NVPTX-LABEL: define protected void @foo(
+// NVPTX-LABEL: define protected ptx_kernel void @foo(
 // NVPTX-SAME: ) #[[ATTR0:[0-9]+]] {
 // NVPTX-NEXT:  [[ENTRY:.*:]]
 // NVPTX-NEXT:    [[CALL:%.*]] = call i32 @__gpu_num_blocks_x() #[[ATTR6:[0-9]+]]
diff --git a/clang/test/Misc/target-invalid-cpu-note/riscv.c b/clang/test/Misc/target-invalid-cpu-note/riscv.c
index fc8536d99cb8..fb54dcb5b3a9 100644
--- a/clang/test/Misc/target-invalid-cpu-note/riscv.c
+++ b/clang/test/Misc/target-invalid-cpu-note/riscv.c
@@ -29,6 +29,7 @@
 // RISCV64-SAME: {{^}}, rocket-rv64
 // RISCV64-SAME: {{^}}, sifive-p450
 // RISCV64-SAME: {{^}}, sifive-p470
+// RISCV64-SAME: {{^}}, sifive-p550
 // RISCV64-SAME: {{^}}, sifive-p670
 // RISCV64-SAME: {{^}}, sifive-s21
 // RISCV64-SAME: {{^}}, sifive-s51
@@ -77,6 +78,7 @@
 // TUNE-RISCV64-SAME: {{^}}, rocket-rv64
 // TUNE-RISCV64-SAME: {{^}}, sifive-p450
 // TUNE-RISCV64-SAME: {{^}}, sifive-p470
+// TUNE-RISCV64-SAME: {{^}}, sifive-p550
 // TUNE-RISCV64-SAME: {{^}}, sifive-p670
 // TUNE-RISCV64-SAME: {{^}}, sifive-s21
 // TUNE-RISCV64-SAME: {{^}}, sifive-s51
diff --git a/clang/test/ParserOpenACC/parse-clauses.c b/clang/test/ParserOpenACC/parse-clauses.c
index 27970615c958..73a09697710f 100644
--- a/clang/test/ParserOpenACC/parse-clauses.c
+++ b/clang/test/ParserOpenACC/parse-clauses.c
@@ -343,23 +343,16 @@ struct HasMembersArray {
 void SelfUpdate() {
   struct Members s;
 
-  // expected-error@+2{{expected '('}}
-  // expected-warning@+1{{OpenACC construct 'update' not yet implemented, pragma ignored}}
+  // expected-error@+1{{expected '('}}
 #pragma acc update self
   for(int i = 0; i < 5;++i) {}
 
-  // expected-error@+6{{use of undeclared identifier 'zero'}}
-  // expected-error@+5{{expected ','}}
-  // expected-error@+4{{expected expression}}
-  // expected-warning@+3{{OpenACC clause 'self' not yet implemented, clause ignored}}
-  // expected-warning@+2{{OpenACC clause 'if_present' not yet implemented, clause ignored}}
-  // expected-warning@+1{{OpenACC construct 'update' not yet implemented, pragma ignored}}
+  // expected-error@+3{{use of undeclared identifier 'zero'}}
+  // expected-error@+2{{expected ','}}
+  // expected-error@+1{{expected expression}}
 #pragma acc update self(zero : s.array[s.value : 5], s.value), if_present
   for(int i = 0; i < 5;++i) {}
 
-  // expected-warning@+3{{OpenACC clause 'self' not yet implemented, clause ignored}}
-  // expected-warning@+2{{OpenACC clause 'if_present' not yet implemented, clause ignored}}
-  // expected-warning@+1{{OpenACC construct 'update' not yet implemented, pragma ignored}}
 #pragma acc update self(s.array[s.value : 5], s.value), if_present
   for(int i = 0; i < 5;++i) {}
 }
diff --git a/clang/test/ParserOpenACC/parse-constructs.c b/clang/test/ParserOpenACC/parse-constructs.c
index 7f090f828feb..9948e33ac94d 100644
--- a/clang/test/ParserOpenACC/parse-constructs.c
+++ b/clang/test/ParserOpenACC/parse-constructs.c
@@ -151,8 +151,7 @@ void func() {
   // expected-error@+1{{OpenACC 'set' construct must have at least one 'default_async', 'device_num', 'device_type' or 'if' clause}}
 #pragma acc set clause list
   for(;;){}
-  // expected-error@+2{{invalid OpenACC clause 'clause'}}
-  // expected-warning@+1{{OpenACC construct 'update' not yet implemented, pragma ignored}}
+  // expected-error@+1{{invalid OpenACC clause 'clause'}}
 #pragma acc update clause list
   for(;;){}
 }
diff --git a/clang/test/Preprocessor/macho-embedded-predefines.c b/clang/test/Preprocessor/macho-embedded-predefines.c
index 74f29199218c..a7e5777a89a9 100644
--- a/clang/test/Preprocessor/macho-embedded-predefines.c
+++ b/clang/test/Preprocessor/macho-embedded-predefines.c
@@ -3,18 +3,18 @@
 // CHECK-7M: #define __APPLE_CC__
 // CHECK-7M: #define __APPLE__
 // CHECK-7M: #define __ARM_ARCH_7M__
-// CHECK-7M-NOT: #define __MACH__
+// CHECK-7M: #define __MACH__
 
 // RUN: %clang_cc1 -E -dM -triple thumbv7em-apple-unknown-macho -target-cpu cortex-m4 %s | FileCheck %s -check-prefix CHECK-7EM
 
 // CHECK-7EM: #define __APPLE_CC__
 // CHECK-7EM: #define __APPLE__
 // CHECK-7EM: #define __ARM_ARCH_7EM__
-// CHECK-7EM-NOT: #define __MACH__
+// CHECK-7EM: #define __MACH__
 
 // RUN: %clang_cc1 -E -dM -triple thumbv6m-apple-unknown-macho -target-cpu cortex-m0 %s | FileCheck %s -check-prefix CHECK-6M
 
 // CHECK-6M: #define __APPLE_CC__
 // CHECK-6M: #define __APPLE__
 // CHECK-6M: #define __ARM_ARCH_6M__
-// CHECK-6M-NOT: #define __MACH__
+// CHECK-6M: #define __MACH__
diff --git a/clang/test/Sema/varargs.c b/clang/test/Sema/varargs.c
index 2cb7270f604a..bec41dda65d5 100644
--- a/clang/test/Sema/varargs.c
+++ b/clang/test/Sema/varargs.c
@@ -75,6 +75,11 @@ void f9(__builtin_va_list args)
     (void)__builtin_va_arg(args, enum E); // Don't warn here in C
     (void)__builtin_va_arg(args, short); // expected-warning {{second argument to 'va_arg' is of promotable type 'short'}}
     (void)__builtin_va_arg(args, char); // expected-warning {{second argument to 'va_arg' is of promotable type 'char'}}
+    // Don't crash on some undefined behaviors.
+    int n;
+    (void)__builtin_va_arg(args, int[10]); // expected-warning{{second argument to 'va_arg' is of array type 'int[10]'}}
+    (void)__builtin_va_arg(args, int[++n]); // expected-warning{{second argument to 'va_arg' is of array type 'int[++n]'}}
+    (void)__builtin_va_arg(args, int[n][n]); // expected-warning{{second argument to 'va_arg' is of array type 'int[n][n]'}}
 }
 
 void f10(int a, ...) {
diff --git a/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp b/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp
index ccc109cbca0f..71e55c8290ee 100644
--- a/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp
+++ b/clang/test/SemaCXX/concept-crash-on-diagnostic.cpp
@@ -31,7 +31,7 @@ void function() {
 // expected-note@#3 {{checking the satisfaction of concept 'convertible_to<bool, bool>'}}
 // expected-note@#2 {{substituting template arguments into constraint expression here}}
 // expected-note@#5 {{checking constraint satisfaction for template 'compare<Object *, Object *>'}}
-// expected-note@#5 {{while substituting deduced template arguments into function template 'compare' [with IteratorL = Object *, IteratorR = Object *]}}
+// expected-note@#5 {{in instantiation of function template specialization 'compare<Object *, Object *>' requested here}}
 
 // expected-note@#4 {{candidate template ignored: constraints not satisfied [with IteratorL = Object *, IteratorR = Object *]}}
 // We don't know exactly the substituted type for `lhs == rhs`, thus a placeholder 'expr-type' is emitted.
diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
index 23c898e6379b..2d43e46b9e3d 100644
--- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
+++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
@@ -196,7 +196,7 @@ struct Foo {
 
 template <int K>
 using Bar = Foo<double, K>; // expected-note {{constraints not satisfied for class template 'Foo'}}
-// expected-note@-1 {{candidate template ignored: could not match}} expected-note@-1 {{candidate template ignored: constraints not satisfied}}
+// expected-note@-1 {{candidate template ignored: could not match}}
 // expected-note@-2 {{implicit deduction guide declared as 'template <int K> requires __is_deducible(test14::Bar, Foo<double, K>) Bar(Foo<double, K>) -> Foo<double, K>'}}
 // expected-note@-3 {{implicit deduction guide declared as 'template <int K> requires __is_deducible(test14::Bar, Foo<double, K>) Bar(const double (&)[K]) -> Foo<double, K>'}}
 double abc[3];
diff --git a/clang/test/SemaCXX/cxx23-assume.cpp b/clang/test/SemaCXX/cxx23-assume.cpp
index 726cb3bff652..7f80cdfe7d45 100644
--- a/clang/test/SemaCXX/cxx23-assume.cpp
+++ b/clang/test/SemaCXX/cxx23-assume.cpp
@@ -129,12 +129,12 @@ constexpr int f5() requires (!C<T>) { return 2; } // expected-note 4 {{while che
 
 static_assert(f5<int>() == 1);
 static_assert(f5<D>() == 1); // expected-note 3 {{while checking constraint satisfaction}}
-                             // expected-note@-1 3 {{while substituting deduced template arguments}}
+                             // expected-note@-1 3 {{in instantiation of}}
                              // expected-error@-2 {{no matching function for call}}
 
 static_assert(f5<double>() == 2);
-static_assert(f5<E>() == 1); // expected-note {{while checking constraint satisfaction}} expected-note {{while substituting deduced template arguments}}
-static_assert(f5<F>() == 2); // expected-note {{while checking constraint satisfaction}} expected-note {{while substituting deduced template arguments}}
+static_assert(f5<E>() == 1); // expected-note {{while checking constraint satisfaction}} expected-note {{in instantiation of}}
+static_assert(f5<F>() == 2); // expected-note {{while checking constraint satisfaction}} expected-note {{in instantiation of}}
 
 // Do not validate assumptions whose evaluation would have side-effects.
 constexpr int foo() {
diff --git a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp
index 4220486d3aed..48061439941f 100644
--- a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp
+++ b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp
@@ -233,7 +233,7 @@ void g() {
     A<Thingy, Thingy> *ap;
     f(ap, ap); // expected-error{{no matching function for call to 'f'}} \
                // expected-note {{while checking constraint satisfaction}} \
-               // expected-note {{while substituting deduced template arguments}}
+               // expected-note {{in instantiation of function template specialization}}
 }
 
 }
diff --git a/clang/test/SemaCXX/lambda-unevaluated.cpp b/clang/test/SemaCXX/lambda-unevaluated.cpp
index d3f937281f20..a9bcab58464e 100644
--- a/clang/test/SemaCXX/lambda-unevaluated.cpp
+++ b/clang/test/SemaCXX/lambda-unevaluated.cpp
@@ -174,7 +174,7 @@ int* func(T) requires requires { []() { T::foo(); }; }; // expected-error{{type
 double* func(...);
 
 static_assert(__is_same(decltype(func(0)), double*)); // expected-note {{while checking constraint satisfaction for template 'func<int>' required here}}
-                                                      // expected-note@-1 {{while substituting deduced template arguments into function template 'func' [with T = int]}}
+                                                      // expected-note@-1 {{in instantiation of function template specialization 'lambda_in_constraints::func<int>'}}
 static_assert(__is_same(decltype(func(WithFoo())), int*));
 
 template <class T>
@@ -252,7 +252,7 @@ S s("a"); // #use
 // expected-note@#S-requires {{substituting template arguments into constraint expression here}}
 // expected-note@#S-requires {{in instantiation of requirement here}}
 // expected-note@#use {{checking constraint satisfaction for template 'S<const char *>' required here}}
-// expected-note@#use {{while substituting deduced template arguments into function template 'S' [with value:auto = const char *]}}
+// expected-note@#use {{requested here}}
 // expected-note-re@#S 2{{candidate constructor {{.*}} not viable}}
 // expected-note@#S-ctor {{constraints not satisfied}}
 // expected-note-re@#S-requires {{because {{.*}} would be invalid}}
diff --git a/clang/test/SemaOpenACC/combined-construct-self-ast.cpp b/clang/test/SemaOpenACC/combined-construct-self-ast.cpp
index 3a6ba3ca6aea..e504ea7f5a07 100644
--- a/clang/test/SemaOpenACC/combined-construct-self-ast.cpp
+++ b/clang/test/SemaOpenACC/combined-construct-self-ast.cpp
@@ -20,6 +20,7 @@ void TemplFunc() {
   for (unsigned i = 0; i < 5; ++i);
   // CHECK-NEXT: OpenACCCombinedConstruct{{.*}}serial loop
   // CHECK-NEXT: self clause
+  // CHECK-NEXT: <<<NULL>>
   // CHECK-NEXT: ForStmt
   // CHECK: NullStmt
 
@@ -65,6 +66,7 @@ void TemplFunc() {
   //
   // CHECK-NEXT: OpenACCCombinedConstruct{{.*}}serial loop
   // CHECK-NEXT: self clause
+  // CHECK-NEXT: <<<NULL>>
   // CHECK-NEXT: ForStmt
   // CHECK: NullStmt
 
diff --git a/clang/test/SemaOpenACC/compute-construct-clause-ast.cpp b/clang/test/SemaOpenACC/compute-construct-clause-ast.cpp
index 69f65f4083ae..58c12b828439 100644
--- a/clang/test/SemaOpenACC/compute-construct-clause-ast.cpp
+++ b/clang/test/SemaOpenACC/compute-construct-clause-ast.cpp
@@ -197,6 +197,7 @@ void TemplFunc() {
   while(true);
   // CHECK-NEXT: OpenACCComputeConstruct{{.*}}serial
   // CHECK-NEXT: self clause
+  // CHECK-NEXT: <<<NULL>>
   // CHECK-NEXT: WhileStmt
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: NullStmt
@@ -393,6 +394,7 @@ void TemplFunc() {
 
   // CHECK-NEXT: OpenACCComputeConstruct{{.*}}serial
   // CHECK-NEXT: self clause
+  // CHECK-NEXT: <<<NULL>>
   // CHECK-NEXT: WhileStmt
   // CHECK-NEXT: CXXBoolLiteralExpr
   // CHECK-NEXT: NullStmt
diff --git a/clang/test/SemaOpenACC/update-construct-ast.cpp b/clang/test/SemaOpenACC/update-construct-ast.cpp
new file mode 100644
index 000000000000..9048e8823f5f
--- /dev/null
+++ b/clang/test/SemaOpenACC/update-construct-ast.cpp
@@ -0,0 +1,267 @@
+// RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s
+
+// Test this with PCH.
+// RUN: %clang_cc1 %s -fopenacc -emit-pch -o %t %s
+// RUN: %clang_cc1 %s -fopenacc -include-pch %t -ast-dump-all | FileCheck %s
+
+#ifndef PCH_HELPER
+#define PCH_HELPER
+
+int some_int();
+long some_long();
+
+int Global;
+short GlobalArray[5];
+
+
+void NormalFunc() {
+  // CHECK-LABEL: NormalFunc
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc update if_present if (some_int() < some_long())
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: if_present clause
+  // CHECK-NEXT: if clause
+  // CHECK-NEXT: BinaryOperator{{.*}}'bool' '<'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'long'
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}} 'long'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_long' 'long ()'
+
+#pragma acc update wait async device_type(A) dtype(B)
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: device_type(A)
+  // CHECK-NEXT: dtype(B)
+#pragma acc update wait(some_int(), some_long()) async(some_int())
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}}'long'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_long' 'long ()'
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()'
+#pragma acc update wait(queues:some_int(), some_long())
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}}'long'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_long' 'long ()'
+#pragma acc update wait(devnum: some_int() :some_int(), some_long())
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}}'int'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' 'int ()'
+  // CHECK-NEXT: CallExpr{{.*}}'long'
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'some_long' 'long ()'
+
+#pragma acc update self(Global, GlobalArray, GlobalArray[0], GlobalArray[0:1])
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: self clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'Global' 'int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}} 'short' lvalue
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 0
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: IntegerLiteral{{.*}} 0
+  // CHECK-NEXT: IntegerLiteral{{.*}} 1
+}
+
+template<typename T>
+void TemplFunc(T t) {
+  // CHECK-LABEL: FunctionTemplateDecl {{.*}}TemplFunc
+  // CHECK-NEXT: TemplateTypeParmDecl
+  // CHECK-NEXT: FunctionDecl{{.*}}TemplFunc
+  // CHECK-NEXT: ParmVarDecl{{.*}} t 'T'
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc update if_present if (T::value < t)
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: if_present clause
+  // CHECK-NEXT: if clause
+  // CHECK-NEXT: BinaryOperator{{.*}}'<dependent type>' '<'
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '<dependent type>'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'T'
+
+#pragma acc update wait async device_type(T) dtype(U)
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: device_type(T)
+  // CHECK-NEXT: dtype(U)
+#pragma acc update wait(T::value, t) async(T::value)
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}}'<dependent type>'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 't' 'T'
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}}'<dependent type>'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T'
+#pragma acc update wait(queues:T::value, t) async(t)
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}}'<dependent type>'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 't' 'T'
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: DeclRefExpr{{.*}} 't' 'T'
+#pragma acc update wait(devnum: T::value:t, T::value)
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}}'<dependent type>'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 't' 'T'
+  // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}}'<dependent type>'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T'
+
+  decltype(T::value) Local = 0, LocalArray[5] = {};
+  // CHECK-NEXT: DeclStmt 
+  // CHECK-NEXT: VarDecl
+  // CHECK-NEXT: IntegerLiteral
+  // CHECK-NEXT: VarDecl
+  // CHECK-NEXT: InitListExpr
+
+#pragma acc update self(Local, LocalArray, LocalArray[0], LocalArray[0:1])
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: self clause
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'Local' 'decltype(T::value)'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'LocalArray' 'decltype(T::value)[5]'
+  // CHECK-NEXT: ArraySubscriptExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'LocalArray' 'decltype(T::value)[5]'
+  // CHECK-NEXT: IntegerLiteral{{.*}}0
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'LocalArray' 'decltype(T::value)[5]'
+  // CHECK-NEXT: IntegerLiteral{{.*}}0
+  // CHECK-NEXT: IntegerLiteral{{.*}}1
+
+  // Instantiation:
+  // CHECK-NEXT: FunctionDecl{{.*}} TemplFunc 'void (SomeStruct)' implicit_instantiation
+  // CHECK-NEXT: TemplateArgument type 'SomeStruct'
+  // CHECK-NEXT: RecordType{{.*}} 'SomeStruct'
+  // CHECK-NEXT: CXXRecord{{.*}} 'SomeStruct'
+  // CHECK-NEXT: ParmVarDecl{{.*}} t 'SomeStruct'
+  // CHECK-NEXT: CompoundStmt
+
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: if_present clause
+  // CHECK-NEXT: if clause
+  // CHECK-NEXT: BinaryOperator{{.*}}'bool' '<'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}}'unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct'
+  // CHECK-NEXT: ImplicitCastExpr {{.*}}'unsigned int'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: MemberExpr{{.*}}.operator unsigned int
+  // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'SomeStruct'
+
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: device_type(T)
+  // CHECK-NEXT: dtype(U)
+
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: MemberExpr{{.*}}.operator unsigned int
+  // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'SomeStruct'
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct'
+
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: <<<NULL>>>
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: MemberExpr{{.*}}.operator unsigned int
+  // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'SomeStruct'
+  // CHECK-NEXT: async clause
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: MemberExpr{{.*}}.operator unsigned int
+  // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'SomeStruct'
+
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: wait clause
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: CXXMemberCallExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: MemberExpr{{.*}}.operator unsigned int
+  // CHECK-NEXT: DeclRefExpr{{.*}}'t' 'SomeStruct'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}}'unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'value' 'const unsigned int'
+  // CHECK-NEXT: NestedNameSpecifier TypeSpec 'SomeStruct'
+
+  // CHECK-NEXT: DeclStmt 
+  // CHECK-NEXT: VarDecl
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: IntegerLiteral
+  // CHECK-NEXT: VarDecl
+  // CHECK-NEXT: InitListExpr
+  // CHECK-NEXT: array_filler
+
+  // CHECK-NEXT: OpenACCUpdateConstruct{{.*}}update
+  // CHECK-NEXT: self clause
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'Local' 'decltype(SomeStruct::value)':'const unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'LocalArray' 'decltype(SomeStruct::value)[5]'
+  // CHECK-NEXT: ArraySubscriptExpr
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'LocalArray' 'decltype(SomeStruct::value)[5]'
+  // CHECK-NEXT: IntegerLiteral{{.*}}0
+  // CHECK-NEXT: ArraySectionExpr
+  // CHECK-NEXT: ImplicitCastExpr
+  // CHECK-NEXT: DeclRefExpr{{.*}} 'LocalArray' 'decltype(SomeStruct::value)[5]'
+  // CHECK-NEXT: IntegerLiteral{{.*}}0
+  // CHECK-NEXT: IntegerLiteral{{.*}}1
+}
+
+struct SomeStruct{
+  static constexpr unsigned value = 5;
+  operator unsigned();
+};
+void use() {
+  TemplFunc(SomeStruct{});
+}
+#endif
diff --git a/clang/test/SemaOpenACC/update-construct.cpp b/clang/test/SemaOpenACC/update-construct.cpp
new file mode 100644
index 000000000000..2abd7a30eda8
--- /dev/null
+++ b/clang/test/SemaOpenACC/update-construct.cpp
@@ -0,0 +1,167 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+struct NotConvertible{} NC;
+int getI();
+void uses() {
+  int Var;
+#pragma acc update async self(Var)
+#pragma acc update wait self(Var)
+#pragma acc update self(Var) device_type(I)
+#pragma acc update if(true) self(Var)
+#pragma acc update if_present self(Var)
+#pragma acc update self(Var)
+  // expected-warning@+1{{OpenACC clause 'host' not yet implemented}}
+#pragma acc update host(Var)
+  // expected-warning@+1{{OpenACC clause 'device' not yet implemented}}
+#pragma acc update device(Var)
+
+  // expected-error@+2{{OpenACC clause 'if' may not follow a 'device_type' clause in a 'update' construct}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc update self(Var) device_type(I) if(true)
+  // expected-error@+2{{OpenACC clause 'if_present' may not follow a 'device_type' clause in a 'update' construct}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc update self(Var) device_type(I) if_present
+  // expected-error@+2{{OpenACC clause 'self' may not follow a 'device_type' clause in a 'update' construct}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc update device_type(I) self(Var)
+  // expected-error@+2{{OpenACC clause 'host' may not follow a 'device_type' clause in a 'update' construct}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc update device_type(I) host(Var)
+  // expected-error@+2{{OpenACC clause 'device' may not follow a 'device_type' clause in a 'update' construct}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc update device_type(I) device(Var)
+  // These 2 are OK.
+#pragma acc update self(Var) device_type(I) async
+#pragma acc update self(Var) device_type(I) wait
+  // Unless otherwise specified, we assume 'device_type' can happen after itself.
+#pragma acc update self(Var) device_type(I) device_type(I)
+
+  // TODO: OpenACC: These should diagnose because there isn't at least 1 of
+  // 'self', 'host', or 'device'.
+#pragma acc update async
+#pragma acc update wait
+#pragma acc update device_type(I)
+#pragma acc update if(true)
+#pragma acc update if_present
+
+  // expected-error@+1{{value of type 'struct NotConvertible' is not contextually convertible to 'bool'}}
+#pragma acc update if (NC) device_type(I)
+
+  // expected-error@+2{{OpenACC 'if' clause cannot appear more than once on a 'update' directive}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc update if(true) if (false)
+
+  // TODO: OpenACC: There is restrictions on the contents of a 'varlist', so
+  // those should be checked here too.
+
+  // Cannot be the body of an 'if', 'while', 'do', 'switch', or
+  // 'label'.
+  // expected-error@+3{{OpenACC 'update' construct may not appear in place of the statement following an if statement}}
+  if (true)
+    // expected-warning@+1{{OpenACC clause 'device' not yet implemented}}
+#pragma acc update device(Var)
+
+  // expected-error@+3{{OpenACC 'update' construct may not appear in place of the statement following a while statement}}
+  while (true)
+    // expected-warning@+1{{OpenACC clause 'device' not yet implemented}}
+#pragma acc update device(Var)
+
+  // expected-error@+3{{OpenACC 'update' construct may not appear in place of the statement following a do statement}}
+  do
+    // expected-warning@+1{{OpenACC clause 'device' not yet implemented}}
+#pragma acc update device(Var)
+  while (true);
+
+  // expected-error@+3{{OpenACC 'update' construct may not appear in place of the statement following a switch statement}}
+  switch(Var)
+    // expected-warning@+1{{OpenACC clause 'device' not yet implemented}}
+#pragma acc update device(Var)
+
+  // expected-error@+3{{OpenACC 'update' construct may not appear in place of the statement following a label statement}}
+  LABEL:
+    // expected-warning@+1{{OpenACC clause 'device' not yet implemented}}
+#pragma acc update device(Var)
+
+  // For loops are OK.
+  for (;;)
+    // expected-warning@+1{{OpenACC clause 'device' not yet implemented}}
+#pragma acc update device(Var)
+
+  // Checking for 'async', which requires an 'int' expression.
+#pragma acc update async
+
+#pragma acc update async(getI())
+  // expected-error@+2{{expected ')'}}
+  // expected-note@+1{{to match this '('}}
+#pragma acc update async(getI(), getI())
+  // expected-error@+2{{OpenACC 'async' clause cannot appear more than once on a 'update' directive}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc update async(getI()) async(getI())
+  // expected-error@+1{{OpenACC clause 'async' requires expression of integer type ('struct NotConvertible' invalid)}}
+#pragma acc update async(NC)
+
+  // Checking for 'wait', which has a complicated set arguments.
+#pragma acc update wait
+#pragma acc update wait()
+#pragma acc update wait(getI(), getI())
+#pragma acc update wait(devnum: getI():  getI())
+#pragma acc update wait(devnum: getI(): queues: getI(), getI())
+  // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('struct NotConvertible' invalid)}}
+#pragma acc update wait(devnum:NC : 5)
+  // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('struct NotConvertible' invalid)}}
+#pragma acc update wait(devnum:5 : NC)
+
+    int arr[5];
+  // expected-error@+3{{OpenACC clause 'wait' requires expression of integer type ('int[5]' invalid)}}
+  // expected-error@+2{{OpenACC clause 'wait' requires expression of integer type ('int[5]' invalid)}}
+  // expected-error@+1{{OpenACC clause 'wait' requires expression of integer type ('struct NotConvertible' invalid)}}
+#pragma acc update wait(devnum:arr : queues: arr, NC, 5)
+}
+
+struct SomeS {
+  int Array[5];
+  int MemberOfComp;
+};
+
+template<typename I, typename T>
+void varlist_restrictions_templ() {
+  I iArray[5];
+  T Single;
+  T Array[5];
+
+  // Members of a subarray of struct or class type may not appear, but others
+  // are permitted to.
+#pragma acc update self(iArray[0:1])
+
+#pragma acc update self(Array[0:1])
+
+  // expected-error@+1{{OpenACC sub-array is not allowed here}}
+#pragma acc update self(Array[0:1].MemberOfComp)
+}
+
+void varlist_restrictions() {
+  varlist_restrictions_templ<int, SomeS>();// expected-note{{in instantiation of}}
+  int iArray[5];
+  SomeS Single;
+  SomeS Array[5];
+
+  int LocalInt;
+  int *LocalPtr;
+
+#pragma acc update self(LocalInt, LocalPtr, Single)
+
+#pragma acc update self(Single.MemberOfComp)
+
+#pragma acc update self(Single.Array[0:1])
+
+
+  // Members of a subarray of struct or class type may not appear, but others
+  // are permitted to.
+#pragma acc update self(iArray[0:1])
+
+#pragma acc update self(Array[0:1])
+
+  // expected-error@+1{{OpenACC sub-array is not allowed here}}
+#pragma acc update self(Array[0:1].MemberOfComp)
+}
+
diff --git a/clang/test/SemaTemplate/concepts-recursive-inst.cpp b/clang/test/SemaTemplate/concepts-recursive-inst.cpp
index 30a410cef91e..9330df8cdd03 100644
--- a/clang/test/SemaTemplate/concepts-recursive-inst.cpp
+++ b/clang/test/SemaTemplate/concepts-recursive-inst.cpp
@@ -76,7 +76,7 @@ auto it = begin(rng); // #BEGIN_CALL
 // expected-note@#INF_BEGIN {{while checking the satisfaction of concept 'Inf<DirectRecursiveCheck::my_range>' requested here}}
 // expected-note@#INF_BEGIN {{while substituting template arguments into constraint expression here}}
 // expected-note@#BEGIN_CALL {{while checking constraint satisfaction for template 'begin<DirectRecursiveCheck::my_range>' required here}}
-// expected-note@#BEGIN_CALL {{while substituting deduced template arguments into function template}}
+// expected-note@#BEGIN_CALL {{in instantiation of function template specialization}}
 
 // Fallout of the failure is failed lookup, which is necessary to stop odd
 // cascading errors.
@@ -103,7 +103,7 @@ namespace GH50891 {
   // expected-note@#OP_TO {{while checking the satisfaction of concept 'Numeric<GH50891::Deferred>' requested here}}
   // expected-note@#OP_TO {{while substituting template arguments into constraint expression here}}
   // expected-note@#FOO_CALL {{while checking constraint satisfaction for template}}
-  // expected-note@#FOO_CALL {{while substituting deduced template arguments into function template}}
+  // expected-note@#FOO_CALL {{in instantiation of function template specialization}}
   // expected-note@#FOO_CALL {{in instantiation of requirement here}}
   // expected-note@#NUMERIC {{while substituting template arguments into constraint expression here}}
 
diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp
index 312469313fc5..f335ca3bd22b 100644
--- a/clang/test/SemaTemplate/concepts.cpp
+++ b/clang/test/SemaTemplate/concepts.cpp
@@ -1165,3 +1165,15 @@ concept C = invalid; // expected-error {{use of undeclared identifier 'invalid'}
 bool val2 = C<int>;
 
 } // namespace GH109780
+
+namespace GH121980 {
+
+template <class>
+concept has_member_difference_type; // expected-error {{expected '='}}
+
+template <has_member_difference_type> struct incrementable_traits; // expected-note {{declared here}}
+
+template <has_member_difference_type Tp>
+struct incrementable_traits<Tp>; // expected-error {{not more specialized than the primary}}
+
+}
diff --git a/clang/test/SemaTemplate/cxx2a-constraint-exprs.cpp b/clang/test/SemaTemplate/cxx2a-constraint-exprs.cpp
index 5809ef684bbf..f4403587a625 100644
--- a/clang/test/SemaTemplate/cxx2a-constraint-exprs.cpp
+++ b/clang/test/SemaTemplate/cxx2a-constraint-exprs.cpp
@@ -34,7 +34,7 @@ namespace constant_evaluated {
      expected-note@-1{{candidate template ignored}}
   int a = (foo<int>(), 0);
   // expected-note@-1 {{while checking}} expected-error@-1{{no matching function}} \
-     expected-note@-1 {{while substituting}}
+     expected-note@-1 {{in instantiation}}
   template<typename T> void bar() requires requires { requires f<int[2]>; } { };
   // expected-note@-1{{in instantiation}} \
      expected-note@-1{{while substituting}} \
diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp
index 67d00bb49f77..d03c783313dd 100644
--- a/clang/test/SemaTemplate/deduction-guide.cpp
+++ b/clang/test/SemaTemplate/deduction-guide.cpp
@@ -234,6 +234,11 @@ F s(0);
 // CHECK: | `-CXXBoolLiteralExpr {{.*}} 'bool' false
 // CHECK: |-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for F> 'auto (U) -> F<>'
 // CHECK: | `-ParmVarDecl {{.*}} 'U'
+// CHECK: `-CXXDeductionGuideDecl {{.*}} implicit <deduction guide for F> 'auto (int) -> F<>'
+// CHECK:   |-TemplateArgument integral ''x''
+// CHECK:   |-TemplateArgument type 'int'
+// CHECK:   | `-BuiltinType {{.*}} 'int'
+// CHECK:   `-ParmVarDecl {{.*}} 'int'
 // CHECK: FunctionProtoType {{.*}} 'auto (U) -> F<>' dependent trailing_return cdecl
 // CHECK: |-InjectedClassNameType {{.*}} 'F<>' dependent
 // CHECK: | `-CXXRecord {{.*}} 'F'
diff --git a/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp b/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp
index 5c7a90273d0e..af3e3358f613 100644
--- a/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp
+++ b/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp
@@ -38,7 +38,7 @@ template<typename A, typename T>
 concept True = true;
 
 template<typename T>
-concept False = false; // #False
+concept False = false;
 
 template<class X> struct concepts {
     template<class Y> struct B {
@@ -68,7 +68,7 @@ template<typename X> struct nested_init_list {
         Y y;
     };
 
-    template<False F>  // #INIT_LIST_INNER_INVALID_HEADER
+    template<False F>
     struct concept_fail { // #INIT_LIST_INNER_INVALID
         X x;
         F f;
@@ -81,9 +81,7 @@ using NIL = nested_init_list<int>::B<int>;
 
 // expected-error@+1 {{no viable constructor or deduction guide for deduction of template arguments of 'nested_init_list<int>::concept_fail'}}
 nested_init_list<int>::concept_fail nil_invalid{1, ""};
-// expected-note@#INIT_LIST_INNER_INVALID {{candidate template ignored: constraints not satisfied [with F = const char *]}}
-// expected-note@#INIT_LIST_INNER_INVALID_HEADER {{because 'const char *' does not satisfy 'False'}}
-// expected-note@#False {{because 'false' evaluated to false}}
+// expected-note@#INIT_LIST_INNER_INVALID {{candidate template ignored: substitution failure [with F = const char *]: constraints not satisfied for class template 'concept_fail' [with F = const char *]}}
 // expected-note@#INIT_LIST_INNER_INVALID {{implicit deduction guide declared as 'template <False F> concept_fail(int, F) -> concept_fail<F>'}}
 // expected-note@#INIT_LIST_INNER_INVALID {{candidate function template not viable: requires 1 argument, but 2 were provided}}
 // expected-note@#INIT_LIST_INNER_INVALID {{implicit deduction guide declared as 'template <False F> concept_fail(concept_fail<F>) -> concept_fail<F>'}}
diff --git a/clang/test/utils/update_cc_test_checks/Inputs/basic-cplusplus.cpp b/clang/test/utils/update_cc_test_checks/Inputs/basic-cplusplus.cpp
index 98be350b3937..e332528e24e2 100644
--- a/clang/test/utils/update_cc_test_checks/Inputs/basic-cplusplus.cpp
+++ b/clang/test/utils/update_cc_test_checks/Inputs/basic-cplusplus.cpp
@@ -1,5 +1,9 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 5
 // Basic C++ test for update_cc_test_checks
 // RUN: %clang_cc1 -triple=x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple=x86_64-apple-macho -emit-llvm -o - %s | FileCheck %s --check-prefix=MACHO
+// RUN: %clang_cc1 -triple=x86_64-windows-msvc -emit-llvm -o - %s | FileCheck %s --check-prefix=MSVC
+// RUN: %clang_cc1 -triple=x86_64-windows-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=MINGW
 
 class Foo {
   int x;
@@ -13,6 +17,8 @@ public:
   inline int function_defined_out_of_line(int arg) const;
 };
 
+[[clang::noinline]] static int static_noinline_fn(int arg) { return arg; }
+
 Foo::Foo(int x) : x(x) {}
 Foo::~Foo() {}
 int Foo::function_defined_out_of_line(int arg) const { return x - arg; }
@@ -22,4 +28,5 @@ int main() {
   Foo f(1);
   f.function_defined_inline(2);
   f.function_defined_out_of_line(3);
+  return static_noinline_fn(0);
 }
diff --git a/clang/test/utils/update_cc_test_checks/Inputs/basic-cplusplus.cpp.expected b/clang/test/utils/update_cc_test_checks/Inputs/basic-cplusplus.cpp.expected
index c42dc07fa359..96370b4bec2d 100644
--- a/clang/test/utils/update_cc_test_checks/Inputs/basic-cplusplus.cpp.expected
+++ b/clang/test/utils/update_cc_test_checks/Inputs/basic-cplusplus.cpp.expected
@@ -1,6 +1,9 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 5
 // Basic C++ test for update_cc_test_checks
 // RUN: %clang_cc1 -triple=x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple=x86_64-apple-macho -emit-llvm -o - %s | FileCheck %s --check-prefix=MACHO
+// RUN: %clang_cc1 -triple=x86_64-windows-msvc -emit-llvm -o - %s | FileCheck %s --check-prefix=MSVC
+// RUN: %clang_cc1 -triple=x86_64-windows-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=MINGW
 
 class Foo {
   int x;
@@ -8,52 +11,109 @@ class Foo {
 public:
   explicit Foo(int x);
   ~Foo();
-// CHECK-LABEL: @_ZNK3Foo23function_defined_inlineEi(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
-// CHECK-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
-// CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
-// CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
-// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    ret i32 [[ADD]]
-//
   inline int function_defined_inline(int arg) const {
     return arg + x;
   }
   inline int function_defined_out_of_line(int arg) const;
 };
 
-// CHECK-LABEL: @_ZN3FooC1Ei(
-// CHECK-NEXT:  entry:
+[[clang::noinline]] static int static_noinline_fn(int arg) { return arg; }
+
+Foo::Foo(int x) : x(x) {}
+Foo::~Foo() {}
+int Foo::function_defined_out_of_line(int arg) const { return x - arg; }
+
+// Call the inline methods to ensure the LLVM IR is generated:
+int main() {
+  Foo f(1);
+  f.function_defined_inline(2);
+  f.function_defined_out_of_line(3);
+  return static_noinline_fn(0);
+}
+// CHECK-LABEL: define dso_local void @_ZN3FooC2Ei(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[X:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] align 2 {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
+// CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    [[X2:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[X2]], align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define dso_local void @_ZN3FooC1Ei(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[X:%.*]]) unnamed_addr #[[ATTR0]] align 2 {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
-// CHECK-NEXT:    store i32 [[X:%.*]], ptr [[X_ADDR]], align 4
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
 // CHECK-NEXT:    call void @_ZN3FooC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]])
 // CHECK-NEXT:    ret void
 //
-Foo::Foo(int x) : x(x) {}
-// CHECK-LABEL: @_ZN3FooD1Ev(
-// CHECK-NEXT:  entry:
+//
+// CHECK-LABEL: define dso_local void @_ZN3FooD2Ev(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] align 2 {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define dso_local void @_ZN3FooD1Ev(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] align 2 {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    call void @_ZN3FooD2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR2:[0-9]+]]
 // CHECK-NEXT:    ret void
 //
-Foo::~Foo() {}
-// CHECK-LABEL: @_ZNK3Foo28function_defined_out_of_lineEi(
-// CHECK-NEXT:  entry:
+//
+// CHECK-LABEL: define dso_local noundef i32 @main(
+// CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[F:%.*]] = alloca [[CLASS_FOO:%.*]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[RETVAL]], align 4
+// CHECK-NEXT:    call void @_ZN3FooC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 1)
+// CHECK-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZNK3Foo23function_defined_inlineEi(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 2)
+// CHECK-NEXT:    [[CALL1:%.*]] = call noundef i32 @_ZNK3Foo28function_defined_out_of_lineEi(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 3)
+// CHECK-NEXT:    [[CALL2:%.*]] = call noundef i32 @_ZL18static_noinline_fni(i32 noundef 0)
+// CHECK-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// CHECK-NEXT:    call void @_ZN3FooD1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[F]]) #[[ATTR2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[RETVAL]], align 4
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+//
+// CHECK-LABEL: define linkonce_odr noundef i32 @_ZNK3Foo23function_defined_inlineEi(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[ARG:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
-// CHECK-NEXT:    store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8
-// CHECK-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
+// CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    ret i32 [[ADD]]
+//
+//
+// CHECK-LABEL: define linkonce_odr noundef i32 @_ZNK3Foo28function_defined_out_of_lineEi(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[ARG:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 4
@@ -61,20 +121,230 @@ Foo::~Foo() {}
 // CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]]
 // CHECK-NEXT:    ret i32 [[SUB]]
 //
-int Foo::function_defined_out_of_line(int arg) const { return x - arg; }
-
-// Call the inline methods to ensure the LLVM IR is generated:
-// CHECK-LABEL: @main(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[F:%.*]] = alloca [[CLASS_FOO:%.*]], align 4
-// CHECK-NEXT:    call void @_ZN3FooC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 1)
-// CHECK-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZNK3Foo23function_defined_inlineEi(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 2)
-// CHECK-NEXT:    [[CALL1:%.*]] = call noundef i32 @_ZNK3Foo28function_defined_out_of_lineEi(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 3)
-// CHECK-NEXT:    call void @_ZN3FooD1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[F]]) #[[ATTR2]]
-// CHECK-NEXT:    ret i32 0
 //
-int main() {
-  Foo f(1);
-  f.function_defined_inline(2);
-  f.function_defined_out_of_line(3);
-}
+// CHECK-LABEL: define internal noundef i32 @_ZL18static_noinline_fni(
+// CHECK-SAME: i32 noundef [[ARG:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+//
+// MACHO-LABEL: define void @_ZN3FooC2Ei(
+// MACHO-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[X:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] align 2 {
+// MACHO-NEXT:  [[ENTRY:.*:]]
+// MACHO-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MACHO-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
+// MACHO-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    [[X2:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
+// MACHO-NEXT:    store i32 [[TMP0]], ptr [[X2]], align 4
+// MACHO-NEXT:    ret void
+//
+//
+// MACHO-LABEL: define void @_ZN3FooC1Ei(
+// MACHO-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[X:%.*]]) unnamed_addr #[[ATTR0]] align 2 {
+// MACHO-NEXT:  [[ENTRY:.*:]]
+// MACHO-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MACHO-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
+// MACHO-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
+// MACHO-NEXT:    call void @_ZN3FooC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]])
+// MACHO-NEXT:    ret void
+//
+//
+// MACHO-LABEL: define void @_ZN3FooD2Ev(
+// MACHO-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] align 2 {
+// MACHO-NEXT:  [[ENTRY:.*:]]
+// MACHO-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MACHO-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    ret void
+//
+//
+// MACHO-LABEL: define void @_ZN3FooD1Ev(
+// MACHO-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] align 2 {
+// MACHO-NEXT:  [[ENTRY:.*:]]
+// MACHO-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MACHO-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    call void @_ZN3FooD2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR2:[0-9]+]]
+// MACHO-NEXT:    ret void
+//
+//
+// MACHO-LABEL: define noundef i32 @main(
+// MACHO-SAME: ) #[[ATTR1:[0-9]+]] {
+// MACHO-NEXT:  [[ENTRY:.*:]]
+// MACHO-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    [[F:%.*]] = alloca [[CLASS_FOO:%.*]], align 4
+// MACHO-NEXT:    store i32 0, ptr [[RETVAL]], align 4
+// MACHO-NEXT:    call void @_ZN3FooC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 1)
+// MACHO-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZNK3Foo23function_defined_inlineEi(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 2)
+// MACHO-NEXT:    [[CALL1:%.*]] = call noundef i32 @_ZNK3Foo28function_defined_out_of_lineEi(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 3)
+// MACHO-NEXT:    [[CALL2:%.*]] = call noundef i32 @_ZL18static_noinline_fni(i32 noundef 0)
+// MACHO-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// MACHO-NEXT:    call void @_ZN3FooD1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[F]]) #[[ATTR2]]
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[RETVAL]], align 4
+// MACHO-NEXT:    ret i32 [[TMP0]]
+//
+//
+// MACHO-LABEL: define linkonce_odr noundef i32 @_ZNK3Foo23function_defined_inlineEi(
+// MACHO-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[ARG:%.*]]) #[[ATTR0]] align 2 {
+// MACHO-NEXT:  [[ENTRY:.*:]]
+// MACHO-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MACHO-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
+// MACHO-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
+// MACHO-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]]
+// MACHO-NEXT:    ret i32 [[ADD]]
+//
+//
+// MACHO-LABEL: define linkonce_odr noundef i32 @_ZNK3Foo28function_defined_out_of_lineEi(
+// MACHO-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[ARG:%.*]]) #[[ATTR0]] align 2 {
+// MACHO-NEXT:  [[ENTRY:.*:]]
+// MACHO-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MACHO-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MACHO-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 4
+// MACHO-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]]
+// MACHO-NEXT:    ret i32 [[SUB]]
+//
+//
+// MACHO-LABEL: define internal noundef i32 @_ZL18static_noinline_fni(
+// MACHO-SAME: i32 noundef [[ARG:%.*]]) #[[ATTR0]] {
+// MACHO-NEXT:  [[ENTRY:.*:]]
+// MACHO-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    ret i32 [[TMP0]]
+//
+//
+// MSVC-LABEL: define dso_local noundef i32 @main(
+// MSVC-SAME: ) #[[ATTR1:[0-9]+]] {
+// MSVC-NEXT:  [[ENTRY:.*:]]
+// MSVC-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// MSVC-NEXT:    [[F:%.*]] = alloca [[CLASS_FOO:%.*]], align 4
+// MSVC-NEXT:    store i32 0, ptr [[RETVAL]], align 4
+// MSVC-NEXT:    [[CALL:%.*]] = call noundef ptr @"??0Foo@@QEAA@H@Z"(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 1)
+// MSVC-NEXT:    [[CALL1:%.*]] = call noundef i32 @"?function_defined_inline@Foo@@QEBAHH@Z"(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 2)
+// MSVC-NEXT:    [[CALL2:%.*]] = call noundef i32 @"?function_defined_out_of_line@Foo@@QEBAHH@Z"(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 3)
+// MSVC-NEXT:    [[CALL3:%.*]] = call noundef i32 @"?static_noinline_fn@@YAHH@Z"(i32 noundef 0)
+// MSVC-NEXT:    store i32 [[CALL3]], ptr [[RETVAL]], align 4
+// MSVC-NEXT:    call void @"??1Foo@@QEAA@XZ"(ptr noundef nonnull align 4 dereferenceable(4) [[F]]) #[[ATTR2:[0-9]+]]
+// MSVC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[RETVAL]], align 4
+// MSVC-NEXT:    ret i32 [[TMP0]]
+//
+//
+// MINGW-LABEL: define dso_local void @_ZN3FooC2Ei(
+// MINGW-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[X:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] align 2 {
+// MINGW-NEXT:  [[ENTRY:.*:]]
+// MINGW-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MINGW-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
+// MINGW-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    [[X2:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
+// MINGW-NEXT:    store i32 [[TMP0]], ptr [[X2]], align 4
+// MINGW-NEXT:    ret void
+//
+//
+// MINGW-LABEL: define dso_local void @_ZN3FooC1Ei(
+// MINGW-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[X:%.*]]) unnamed_addr #[[ATTR0]] align 2 {
+// MINGW-NEXT:  [[ENTRY:.*:]]
+// MINGW-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MINGW-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    store i32 [[X]], ptr [[X_ADDR]], align 4
+// MINGW-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X_ADDR]], align 4
+// MINGW-NEXT:    call void @_ZN3FooC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]])
+// MINGW-NEXT:    ret void
+//
+//
+// MINGW-LABEL: define dso_local void @_ZN3FooD2Ev(
+// MINGW-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] align 2 {
+// MINGW-NEXT:  [[ENTRY:.*:]]
+// MINGW-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MINGW-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    ret void
+//
+//
+// MINGW-LABEL: define dso_local void @_ZN3FooD1Ev(
+// MINGW-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] align 2 {
+// MINGW-NEXT:  [[ENTRY:.*:]]
+// MINGW-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MINGW-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    call void @_ZN3FooD2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR2:[0-9]+]]
+// MINGW-NEXT:    ret void
+//
+//
+// MINGW-LABEL: define dso_local noundef i32 @main(
+// MINGW-SAME: ) #[[ATTR1:[0-9]+]] {
+// MINGW-NEXT:  [[ENTRY:.*:]]
+// MINGW-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    [[F:%.*]] = alloca [[CLASS_FOO:%.*]], align 4
+// MINGW-NEXT:    store i32 0, ptr [[RETVAL]], align 4
+// MINGW-NEXT:    call void @_ZN3FooC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 1)
+// MINGW-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZNK3Foo23function_defined_inlineEi(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 2)
+// MINGW-NEXT:    [[CALL1:%.*]] = call noundef i32 @_ZNK3Foo28function_defined_out_of_lineEi(ptr noundef nonnull align 4 dereferenceable(4) [[F]], i32 noundef 3)
+// MINGW-NEXT:    [[CALL2:%.*]] = call noundef i32 @_ZL18static_noinline_fni(i32 noundef 0)
+// MINGW-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// MINGW-NEXT:    call void @_ZN3FooD1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[F]]) #[[ATTR2]]
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[RETVAL]], align 4
+// MINGW-NEXT:    ret i32 [[TMP0]]
+//
+//
+// MINGW-LABEL: define linkonce_odr dso_local noundef i32 @_ZNK3Foo23function_defined_inlineEi(
+// MINGW-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[ARG:%.*]]) #[[ATTR0]] comdat align 2 {
+// MINGW-NEXT:  [[ENTRY:.*:]]
+// MINGW-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MINGW-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
+// MINGW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
+// MINGW-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]]
+// MINGW-NEXT:    ret i32 [[ADD]]
+//
+//
+// MINGW-LABEL: define linkonce_odr dso_local noundef i32 @_ZNK3Foo28function_defined_out_of_lineEi(
+// MINGW-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[ARG:%.*]]) #[[ATTR0]] comdat align 2 {
+// MINGW-NEXT:  [[ENTRY:.*:]]
+// MINGW-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// MINGW-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// MINGW-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[CLASS_FOO:%.*]], ptr [[THIS1]], i32 0, i32 0
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 4
+// MINGW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]]
+// MINGW-NEXT:    ret i32 [[SUB]]
+//
+//
+// MINGW-LABEL: define internal noundef i32 @_ZL18static_noinline_fni(
+// MINGW-SAME: i32 noundef [[ARG:%.*]]) #[[ATTR0]] {
+// MINGW-NEXT:  [[ENTRY:.*:]]
+// MINGW-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    ret i32 [[TMP0]]
+//
diff --git a/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c
new file mode 100644
index 000000000000..018f99264006
--- /dev/null
+++ b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c
@@ -0,0 +1,49 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+/// Check that we generate checks for functions even though the mangledName
+/// property in the AST dump JSON does not match the LLVM IR name.
+// RUN: %clang_cc1 -triple=x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=ELF
+// RUN: %clang_cc1 -triple=x86_64-apple-macho -emit-llvm -o - %s | FileCheck %s --check-prefix=MACHO
+// RUN: %clang_cc1 -triple=x86_64-windows-msvc -emit-llvm -o - %s | FileCheck %s --check-prefix=MSVC
+// RUN: %clang_cc1 -triple=x86_64-windows-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=MINGW
+// RUN: %clang_cc1 -triple=i686-unknown-win32 -emit-llvm -o - %s | FileCheck %s --check-prefix=WIN32
+// RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=THUMB-DARWIN
+
+// UTC_ARGS: --disable
+// ELF: target datalayout = "e-m:e-
+// MACHO: target datalayout = "e-m:o-
+// MSVC: target datalayout = "e-m:w-
+// MINGW: target datalayout = "e-m:w-
+// WIN32: target datalayout = "e-m:x-
+// THUMB-DARWIN: target datalayout = "e-m:o-
+// UTC_ARGS: --enable
+
+#ifdef __arm__
+/// FIXME: UTC does not find this function, but can find all others.
+typedef __attribute__((neon_vector_type(8))) __INT8_TYPE__ int8x8_t;
+int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
+  return a + b + c;
+}
+#endif
+
+/// Check global variable mangling
+[[gnu::used]] static int i1 = 1;
+int i2 = 2;
+
+[[clang::noinline,gnu::used]] static int static_noinline_fn(int arg) { return arg; }
+
+[[gnu::visibility("hidden")]] int hidden_visibility(int arg) { return arg; }
+
+#ifdef __ELF__
+[[gnu::visibility("protected")]] int protected_visibility(int arg) { return arg; }
+#endif
+
+[[gnu::visibility("default")]] int default_visibility(int arg) { return arg; }
+
+int no_visibility(int arg) { return arg; }
+
+
+/// FIXME: the i386 @fastcall@12 is not being checked here
+#ifdef _WIN32
+int __fastcall fastcall(int arg, long arg2, long arg3) { return arg; }
+#endif
+
diff --git a/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c.expected b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c.expected
new file mode 100644
index 000000000000..5d514f9d64c0
--- /dev/null
+++ b/clang/test/utils/update_cc_test_checks/Inputs/c-symbol-mangling.c.expected
@@ -0,0 +1,246 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+/// Check that we generate checks for functions even though the mangledName
+/// property in the AST dump JSON does not match the LLVM IR name.
+// RUN: %clang_cc1 -triple=x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=ELF
+// RUN: %clang_cc1 -triple=x86_64-apple-macho -emit-llvm -o - %s | FileCheck %s --check-prefix=MACHO
+// RUN: %clang_cc1 -triple=x86_64-windows-msvc -emit-llvm -o - %s | FileCheck %s --check-prefix=MSVC
+// RUN: %clang_cc1 -triple=x86_64-windows-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=MINGW
+// RUN: %clang_cc1 -triple=i686-unknown-win32 -emit-llvm -o - %s | FileCheck %s --check-prefix=WIN32
+// RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=THUMB-DARWIN
+
+// UTC_ARGS: --disable
+// ELF: target datalayout = "e-m:e-
+// MACHO: target datalayout = "e-m:o-
+// MSVC: target datalayout = "e-m:w-
+// MINGW: target datalayout = "e-m:w-
+// WIN32: target datalayout = "e-m:x-
+// THUMB-DARWIN: target datalayout = "e-m:o-
+// UTC_ARGS: --enable
+
+#ifdef __arm__
+/// FIXME: UTC does not find this function, but can find all others.
+typedef __attribute__((neon_vector_type(8))) __INT8_TYPE__ int8x8_t;
+int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
+  return a + b + c;
+}
+#endif
+
+/// Check global variable mangling
+[[gnu::used]] static int i1 = 1;
+int i2 = 2;
+
+// ELF-LABEL: @static_noinline_fn(
+// ELF-NEXT:  entry:
+// ELF-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// ELF-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    ret i32 [[TMP0]]
+//
+// MACHO-LABEL: @static_noinline_fn(
+// MACHO-NEXT:  entry:
+// MACHO-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    ret i32 [[TMP0]]
+//
+// MSVC-LABEL: @static_noinline_fn(
+// MSVC-NEXT:  entry:
+// MSVC-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MSVC-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    ret i32 [[TMP0]]
+//
+// MINGW-LABEL: @static_noinline_fn(
+// MINGW-NEXT:  entry:
+// MINGW-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    ret i32 [[TMP0]]
+//
+// WIN32-LABEL: @static_noinline_fn(
+// WIN32-NEXT:  entry:
+// WIN32-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// WIN32-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// WIN32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// WIN32-NEXT:    ret i32 [[TMP0]]
+//
+// THUMB-DARWIN-LABEL: @static_noinline_fn(
+// THUMB-DARWIN-NEXT:  entry:
+// THUMB-DARWIN-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// THUMB-DARWIN-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// THUMB-DARWIN-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// THUMB-DARWIN-NEXT:    ret i32 [[TMP0]]
+//
+[[clang::noinline,gnu::used]] static int static_noinline_fn(int arg) { return arg; }
+
+// ELF-LABEL: @hidden_visibility(
+// ELF-NEXT:  entry:
+// ELF-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// ELF-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    ret i32 [[TMP0]]
+//
+// MACHO-LABEL: @hidden_visibility(
+// MACHO-NEXT:  entry:
+// MACHO-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    ret i32 [[TMP0]]
+//
+// MSVC-LABEL: @hidden_visibility(
+// MSVC-NEXT:  entry:
+// MSVC-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MSVC-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    ret i32 [[TMP0]]
+//
+// MINGW-LABEL: @hidden_visibility(
+// MINGW-NEXT:  entry:
+// MINGW-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    ret i32 [[TMP0]]
+//
+// WIN32-LABEL: @hidden_visibility(
+// WIN32-NEXT:  entry:
+// WIN32-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// WIN32-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// WIN32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// WIN32-NEXT:    ret i32 [[TMP0]]
+//
+// THUMB-DARWIN-LABEL: @hidden_visibility(
+// THUMB-DARWIN-NEXT:  entry:
+// THUMB-DARWIN-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// THUMB-DARWIN-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// THUMB-DARWIN-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// THUMB-DARWIN-NEXT:    ret i32 [[TMP0]]
+//
+[[gnu::visibility("hidden")]] int hidden_visibility(int arg) { return arg; }
+
+#ifdef __ELF__
+// ELF-LABEL: @protected_visibility(
+// ELF-NEXT:  entry:
+// ELF-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// ELF-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    ret i32 [[TMP0]]
+//
+[[gnu::visibility("protected")]] int protected_visibility(int arg) { return arg; }
+#endif
+
+// ELF-LABEL: @default_visibility(
+// ELF-NEXT:  entry:
+// ELF-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// ELF-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    ret i32 [[TMP0]]
+//
+// MACHO-LABEL: @default_visibility(
+// MACHO-NEXT:  entry:
+// MACHO-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    ret i32 [[TMP0]]
+//
+// MSVC-LABEL: @default_visibility(
+// MSVC-NEXT:  entry:
+// MSVC-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MSVC-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    ret i32 [[TMP0]]
+//
+// MINGW-LABEL: @default_visibility(
+// MINGW-NEXT:  entry:
+// MINGW-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    ret i32 [[TMP0]]
+//
+// WIN32-LABEL: @default_visibility(
+// WIN32-NEXT:  entry:
+// WIN32-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// WIN32-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// WIN32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// WIN32-NEXT:    ret i32 [[TMP0]]
+//
+// THUMB-DARWIN-LABEL: @default_visibility(
+// THUMB-DARWIN-NEXT:  entry:
+// THUMB-DARWIN-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// THUMB-DARWIN-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// THUMB-DARWIN-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// THUMB-DARWIN-NEXT:    ret i32 [[TMP0]]
+//
+[[gnu::visibility("default")]] int default_visibility(int arg) { return arg; }
+
+// ELF-LABEL: @no_visibility(
+// ELF-NEXT:  entry:
+// ELF-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// ELF-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// ELF-NEXT:    ret i32 [[TMP0]]
+//
+// MACHO-LABEL: @no_visibility(
+// MACHO-NEXT:  entry:
+// MACHO-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MACHO-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MACHO-NEXT:    ret i32 [[TMP0]]
+//
+// MSVC-LABEL: @no_visibility(
+// MSVC-NEXT:  entry:
+// MSVC-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MSVC-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    ret i32 [[TMP0]]
+//
+// MINGW-LABEL: @no_visibility(
+// MINGW-NEXT:  entry:
+// MINGW-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    ret i32 [[TMP0]]
+//
+// WIN32-LABEL: @no_visibility(
+// WIN32-NEXT:  entry:
+// WIN32-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// WIN32-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// WIN32-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// WIN32-NEXT:    ret i32 [[TMP0]]
+//
+// THUMB-DARWIN-LABEL: @no_visibility(
+// THUMB-DARWIN-NEXT:  entry:
+// THUMB-DARWIN-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// THUMB-DARWIN-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// THUMB-DARWIN-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// THUMB-DARWIN-NEXT:    ret i32 [[TMP0]]
+//
+int no_visibility(int arg) { return arg; }
+
+
+/// FIXME: the i386 @fastcall@12 is not being checked here
+#ifdef _WIN32
+// MSVC-LABEL: @fastcall(
+// MSVC-NEXT:  entry:
+// MSVC-NEXT:    [[ARG3_ADDR:%.*]] = alloca i32, align 4
+// MSVC-NEXT:    [[ARG2_ADDR:%.*]] = alloca i32, align 4
+// MSVC-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MSVC-NEXT:    store i32 [[ARG3:%.*]], ptr [[ARG3_ADDR]], align 4
+// MSVC-NEXT:    store i32 [[ARG2:%.*]], ptr [[ARG2_ADDR]], align 4
+// MSVC-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MSVC-NEXT:    ret i32 [[TMP0]]
+//
+// MINGW-LABEL: @fastcall(
+// MINGW-NEXT:  entry:
+// MINGW-NEXT:    [[ARG_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    [[ARG2_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    [[ARG3_ADDR:%.*]] = alloca i32, align 4
+// MINGW-NEXT:    store i32 [[ARG:%.*]], ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    store i32 [[ARG2:%.*]], ptr [[ARG2_ADDR]], align 4
+// MINGW-NEXT:    store i32 [[ARG3:%.*]], ptr [[ARG3_ADDR]], align 4
+// MINGW-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// MINGW-NEXT:    ret i32 [[TMP0]]
+//
+int __fastcall fastcall(int arg, long arg2, long arg3) { return arg; }
+#endif
+
diff --git a/clang/test/utils/update_cc_test_checks/c-symbol-mangling.test b/clang/test/utils/update_cc_test_checks/c-symbol-mangling.test
new file mode 100644
index 000000000000..35cff933932f
--- /dev/null
+++ b/clang/test/utils/update_cc_test_checks/c-symbol-mangling.test
@@ -0,0 +1,8 @@
+## Test that we handle mangled C symbol names correctly in the update script
+
+# RUN: cp %S/Inputs/c-symbol-mangling.c %t-generated.c && %update_cc_test_checks %t-generated.c
+# RUN: diff -u %S/Inputs/c-symbol-mangling.c.expected %t-generated.c
+
+## Check that re-running update_cc_test_checks doesn't change the output
+# RUN: %update_cc_test_checks %t-generated.c
+# RUN: diff -u %S/Inputs/c-symbol-mangling.c.expected %t-generated.c
diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
index bd36181fca3f..709dc513be28 100644
--- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp
+++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
@@ -913,7 +913,7 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) {
       return llvm::nulls();
 
     std::error_code EC;
-    FileOS.emplace(OutputFileName, EC);
+    FileOS.emplace(OutputFileName, EC, llvm::sys::fs::OF_Text);
     if (EC) {
       llvm::errs() << "Failed to open output file '" << OutputFileName
                    << "': " << llvm::errorCodeToError(EC) << '\n';
@@ -1003,9 +1003,9 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) {
           auto OSIter = OSs.find(MakeformatOutputPath);
           if (OSIter == OSs.end()) {
             std::error_code EC;
-            OSIter =
-                OSs.try_emplace(MakeformatOutputPath, MakeformatOutputPath, EC)
-                    .first;
+            OSIter = OSs.try_emplace(MakeformatOutputPath, MakeformatOutputPath,
+                                     EC, llvm::sys::fs::OF_Text)
+                         .first;
             if (EC)
               llvm::errs() << "Failed to open P1689 make format output file \""
                            << MakeformatOutputPath << "\" for " << EC.message()
diff --git a/clang/tools/driver/driver.cpp b/clang/tools/driver/driver.cpp
index 12038de476ac..ffd157e60997 100644
--- a/clang/tools/driver/driver.cpp
+++ b/clang/tools/driver/driver.cpp
@@ -355,10 +355,12 @@ int clang_main(int Argc, char **Argv, const llvm::ToolContext &ToolContext) {
   if (!SetBackdoorDriverOutputsFromEnvVars(TheDriver))
     return 1;
 
+  auto ExecuteCC1WithContext =
+      [&ToolContext](SmallVectorImpl<const char *> &ArgV) {
+        return ExecuteCC1Tool(ArgV, ToolContext);
+      };
   if (!UseNewCC1Process) {
-    TheDriver.CC1Main = [ToolContext](SmallVectorImpl<const char *> &ArgV) {
-      return ExecuteCC1Tool(ArgV, ToolContext);
-    };
+    TheDriver.CC1Main = ExecuteCC1WithContext;
     // Ensure the CC1Command actually catches cc1 crashes
     llvm::CrashRecoveryContext::Enable();
   }
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 3e761024392c..5e51fc4e2f66 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2839,8 +2839,13 @@ void OpenACCClauseEnqueue::VisitIfClause(const OpenACCIfClause &C) {
   Visitor.AddStmt(C.getConditionExpr());
 }
 void OpenACCClauseEnqueue::VisitSelfClause(const OpenACCSelfClause &C) {
-  if (C.hasConditionExpr())
-    Visitor.AddStmt(C.getConditionExpr());
+  if (C.isConditionExprClause()) {
+    if (C.hasConditionExpr())
+      Visitor.AddStmt(C.getConditionExpr());
+  } else {
+    for (Expr *Var : C.getVarList())
+      Visitor.AddStmt(Var);
+  }
 }
 void OpenACCClauseEnqueue::VisitNumWorkersClause(
     const OpenACCNumWorkersClause &C) {
@@ -6439,6 +6444,8 @@ CXString clang_getCursorKindSpelling(enum CXCursorKind Kind) {
     return cxstring::createRef("OpenACCShutdownConstruct");
   case CXCursor_OpenACCSetConstruct:
     return cxstring::createRef("OpenACCSetConstruct");
+  case CXCursor_OpenACCUpdateConstruct:
+    return cxstring::createRef("OpenACCUpdateConstruct");
   }
 
   llvm_unreachable("Unhandled CXCursorKind");
diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp
index cbc3485d4197..ee276d8e4e14 100644
--- a/clang/tools/libclang/CXCursor.cpp
+++ b/clang/tools/libclang/CXCursor.cpp
@@ -912,6 +912,9 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent,
   case Stmt::OpenACCSetConstructClass:
     K = CXCursor_OpenACCSetConstruct;
     break;
+  case Stmt::OpenACCUpdateConstructClass:
+    K = CXCursor_OpenACCUpdateConstruct;
+    break;
   case Stmt::OMPTargetParallelGenericLoopDirectiveClass:
     K = CXCursor_OMPTargetParallelGenericLoopDirective;
     break;
diff --git a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
index f3d953454173..92ec79d12657 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
@@ -2253,6 +2253,21 @@ TEST_P(ASTMatchersTest, HasDependentName_DependentScopeDeclRefExpr) {
                       dependentScopeDeclRefExpr(hasDependentName("foo"))));
 }
 
+TEST_P(ASTMatchersTest, HasDependentName_DependentNameType) {
+  if (!GetParam().isCXX()) {
+    // FIXME: Fix this test to work with delayed template parsing.
+    return;
+  }
+
+  EXPECT_TRUE(matches(
+      R"(
+        template <typename T> struct declToImport {
+          typedef typename T::type dependent_name;
+        };
+      )",
+      dependentNameType(hasDependentName("type"))));
+}
+
 TEST(ASTMatchersTest, NamesMember_CXXDependentScopeMemberExpr) {
 
   // Member functions:
diff --git a/clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp b/clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp
index 6488833bd14c..d27f6a6d27e7 100644
--- a/clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp
@@ -149,6 +149,35 @@ TEST_F(CachedConstAccessorsLatticeTest, SameLocBeforeClearOrDiffAfterClear) {
 }
 
 TEST_F(CachedConstAccessorsLatticeTest,
+       SameLocBeforeClearOrDiffAfterClearWithCallee) {
+  CommonTestInputs Inputs;
+  auto *CE = Inputs.CallRef;
+  RecordStorageLocation Loc(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+                            {});
+
+  LatticeT Lattice;
+  auto NopInit = [](StorageLocation &) {};
+  const FunctionDecl *Callee = CE->getDirectCallee();
+  ASSERT_NE(Callee, nullptr);
+  StorageLocation &Loc1 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+      Loc, Callee, Env, NopInit);
+  auto NotCalled = [](StorageLocation &) {
+    ASSERT_TRUE(false) << "Not reached";
+  };
+  StorageLocation &Loc2 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+      Loc, Callee, Env, NotCalled);
+
+  EXPECT_EQ(&Loc1, &Loc2);
+
+  Lattice.clearConstMethodReturnStorageLocations(Loc);
+  StorageLocation &Loc3 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+      Loc, Callee, Env, NopInit);
+
+  EXPECT_NE(&Loc3, &Loc1);
+  EXPECT_NE(&Loc3, &Loc2);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest,
        SameStructValBeforeClearOrDiffAfterClear) {
   TestAST AST(R"cpp(
     struct S {
diff --git a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
index de16f6be8eed..19c3ff49eab2 100644
--- a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
@@ -3771,6 +3771,54 @@ TEST_P(UncheckedOptionalAccessTest, ConstPointerAccessorWithModInBetween) {
                        /*IgnoreSmartPointerDereference=*/false);
 }
 
+TEST_P(UncheckedOptionalAccessTest, SmartPointerAccessorMixed) {
+  ExpectDiagnosticsFor(R"cc(
+     #include "unchecked_optional_access_test.h"
+
+    struct A {
+      $ns::$optional<int> x;
+    };
+
+    namespace absl {
+    template<typename T>
+    class StatusOr {
+      public:
+      bool ok() const;
+
+      const T& operator*() const&;
+      T& operator*() &;
+
+      const T* operator->() const;
+      T* operator->();
+
+      const T& value() const;
+      T& value();
+    };
+    }
+
+    void target(absl::StatusOr<A> &mut, const absl::StatusOr<A> &imm) {
+      if (!mut.ok() || !imm.ok())
+        return;
+
+      if (mut->x.has_value()) {
+        mut->x.value();
+        ((*mut).x).value();
+        (mut.value().x).value();
+
+        // check flagged after modifying
+        mut = imm;
+        mut->x.value();  // [[unsafe]]
+      }
+      if (imm->x.has_value()) {
+        imm->x.value();
+        ((*imm).x).value();
+        (imm.value().x).value();
+      }
+    }
+  )cc",
+                       /*IgnoreSmartPointerDereference=*/false);
+}
+
 TEST_P(UncheckedOptionalAccessTest, ConstBoolAccessor) {
   ExpectDiagnosticsFor(R"cc(
     #include "unchecked_optional_access_test.h"
diff --git a/clang/unittests/StaticAnalyzer/Z3CrosscheckOracleTest.cpp b/clang/unittests/StaticAnalyzer/Z3CrosscheckOracleTest.cpp
index ed8627c50009..626f5c163d17 100644
--- a/clang/unittests/StaticAnalyzer/Z3CrosscheckOracleTest.cpp
+++ b/clang/unittests/StaticAnalyzer/Z3CrosscheckOracleTest.cpp
@@ -27,22 +27,13 @@ static constexpr std::optional<bool> UNDEF = std::nullopt;
 static unsigned operator""_ms(unsigned long long ms) { return ms; }
 static unsigned operator""_step(unsigned long long rlimit) { return rlimit; }
 
-template <class Ret, class Arg> static Ret makeDefaultOption(Arg Value) {
-  return Value;
-}
-template <> PositiveAnalyzerOption makeDefaultOption(int Value) {
-  auto DefaultVal = PositiveAnalyzerOption::create(Value);
-  assert(DefaultVal.has_value());
-  return DefaultVal.value();
-}
-
 static const AnalyzerOptions DefaultOpts = [] {
   AnalyzerOptions Config;
 #define ANALYZER_OPTION_DEPENDS_ON_USER_MODE(TYPE, NAME, CMDFLAG, DESC,        \
                                              SHALLOW_VAL, DEEP_VAL)            \
   ANALYZER_OPTION(TYPE, NAME, CMDFLAG, DESC, DEEP_VAL)
 #define ANALYZER_OPTION(TYPE, NAME, CMDFLAG, DESC, DEFAULT_VAL)                \
-  Config.NAME = makeDefaultOption<TYPE>(DEFAULT_VAL);
+  Config.NAME = DEFAULT_VAL;
 #include "clang/StaticAnalyzer/Core/AnalyzerOptions.def"
 
   // Remember to update the tests in this file when these values change.
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index cf7e5a1ee3e0..97b768db3a31 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -1640,13 +1640,6 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) {
   OS << "  return x0 & (1ULL << 63);\n";
   OS << "}\n\n";
 
-  OS << "__ai bool __arm_in_streaming_mode(void) __arm_streaming_compatible "
-        "{\n";
-  OS << "  uint64_t x0, x1;\n";
-  OS << "  __builtin_arm_get_sme_state(&x0, &x1);\n";
-  OS << "  return x0 & 1;\n";
-  OS << "}\n\n";
-
   OS << "void *__arm_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_compatible;\n";
   OS << "void *__arm_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_compatible;\n";
   OS << "void *__arm_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible;\n";
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 70f57a0c00a7..f2716f1e4c65 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -14045,7 +14045,7 @@ and <I>POD class</I></td>
     <td><a href="https://cplusplus.github.io/CWG/issues/2369.html">2369</a></td>
     <td>CD6</td>
     <td>Ordering between constraints and substitution</td>
-    <td class="partial" align="center">Partial</td>
+    <td class="unknown" align="center">Unknown</td>
   </tr>
   <tr id="2370">
     <td><a href="https://cplusplus.github.io/CWG/issues/2370.html">2370</a></td>
@@ -16464,11 +16464,7 @@ objects</td>
     <td><a href="https://cplusplus.github.io/CWG/issues/2770.html">2770</a></td>
     <td>open</td>
     <td>Trailing <I>requires-clause</I> can refer to function parameters before they are substituted into</td>
-    <td align="center">
-      <details>
-        <summary>Not resolved</summary>
-        Clang 20 implements 2023-07-14 resolution
-      </details></td>
+    <td align="center">Not resolved</td>
   </tr>
   <tr id="2771">
     <td><a href="https://cplusplus.github.io/CWG/issues/2771.html">2771</a></td>
diff --git a/clang/www/make_cxx_dr_status b/clang/www/make_cxx_dr_status
index e0885fdbd2d3..05de96119fc1 100755
--- a/clang/www/make_cxx_dr_status
+++ b/clang/www/make_cxx_dr_status
@@ -83,8 +83,8 @@ else:
 
 status_map = collect_tests()
 drs = get_issues(issue_list_path)
-out_file = open(output, 'w')
-out_file.write('''\
+out_html = []
+out_html.append('''\
 <!DOCTYPE html>
 <!-- This file is auto-generated by make_cxx_dr_status. Do not modify. -->
 <html>
@@ -149,7 +149,7 @@ def availability(issue):
     unresolved_status = unresolved_status_match.group(1)
     proposed_resolution_match = re.search(r' (open|drafting|review|tentatively ready|ready) (\d{4}-\d{2}(?:-\d{2})?|P\d{4}R\d+)$', status)
     if proposed_resolution_match is None:
-      raise AvailabilityError('Issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.issue, unresolved_status))
+      raise AvailabilityError('error: issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.issue, unresolved_status))
     proposed_resolution = proposed_resolution_match.group(2)
     status = status[:-1-len(proposed_resolution)]
     status = status[:-1-len(unresolved_status)]
@@ -236,7 +236,7 @@ def availability(issue):
     avail = 'Duplicate of <a href="#%s">%s</a>' % (dup, dup)
     _, avail_style, _, _ = availability(dup)
   else:
-    raise AvailabilityError('Unknown status %s for issue %s' % (status, dr.issue))
+    raise AvailabilityError('error: unknown status %s for issue %s' % (status, dr.issue))
   return (avail + avail_suffix, avail_style, unresolved_status, details)
 
 count = {}
@@ -266,7 +266,7 @@ for dr in drs:
     else:
       if unresolved_status != dr.status:
         availability_error_occurred = True
-        print("Issue %s is marked '%s', which differs from CWG index status '%s'" \
+        print("error: issue %s is marked '%s', which differs from CWG index status '%s'" \
              % (dr.issue, unresolved_status, dr.status))
         continue
   else:
@@ -280,7 +280,7 @@ for dr in drs:
 
     if unresolved_status:
       availability_error_occurred = True
-      print("Issue %s is marked '%s', even though it is resolved in CWG index" \
+      print("error: issue %s is marked '%s', even though it is resolved in CWG index" \
            % (dr.issue, unresolved_status))
       continue
 
@@ -296,7 +296,7 @@ for dr in drs:
         <summary>{avail}</summary>
         {details}
       </details>'''
-  out_file.write(f'''
+  out_html.append(f'''
   <tr{row_style} id="{dr.issue}">
     <td><a href="https://cplusplus.github.io/CWG/issues/{dr.issue}.html">{dr.issue}</a></td>
     <td>{dr.status}</td>
@@ -310,12 +310,14 @@ if availability_error_occurred:
 for status, num in sorted(count.items()):
   print("%s: %s" % (status, num), file=sys.stderr)
 
-out_file.write('''\
+out_html.append('''\
 </table>
 
 </div>
 </body>
 </html>
 ''')
-out_file.close()
 
+out_file = open(output, 'w')
+out_file.write(''.join(out_html))
+out_file.close()
diff --git a/compiler-rt/cmake/Modules/AddCompilerRT.cmake b/compiler-rt/cmake/Modules/AddCompilerRT.cmake
index 3a6762320f44..c3e734f72392 100644
--- a/compiler-rt/cmake/Modules/AddCompilerRT.cmake
+++ b/compiler-rt/cmake/Modules/AddCompilerRT.cmake
@@ -389,7 +389,8 @@ function(add_compiler_rt_runtime name type)
         set_target_properties(${libname} PROPERTIES IMPORT_PREFIX "")
         set_target_properties(${libname} PROPERTIES IMPORT_SUFFIX ".lib")
       endif()
-      if (APPLE AND NOT CMAKE_LINKER MATCHES ".*lld.*")
+      find_program(CODESIGN codesign)
+      if (APPLE AND NOT CMAKE_LINKER MATCHES ".*lld.*" AND CODESIGN)
         # Apple's linker signs the resulting dylib with an ad-hoc code signature in
         # most situations, except:
         # 1. Versions of ld64 prior to ld64-609 in Xcode 12 predate this behavior.
@@ -404,7 +405,7 @@ function(add_compiler_rt_runtime name type)
         # argument and looking for `invalid argument "linker-signed"` in its output.
         # FIXME: Remove this once all supported toolchains support `-o linker-signed`.
         execute_process(
-          COMMAND sh -c "codesign -f -s - -o linker-signed this-does-not-exist 2>&1 | grep -q linker-signed"
+          COMMAND sh -c "${CODESIGN} -f -s - -o linker-signed this-does-not-exist 2>&1 | grep -q linker-signed"
           RESULT_VARIABLE CODESIGN_SUPPORTS_LINKER_SIGNED
         )
 
@@ -415,7 +416,7 @@ function(add_compiler_rt_runtime name type)
 
         add_custom_command(TARGET ${libname}
           POST_BUILD
-          COMMAND codesign --sign - ${EXTRA_CODESIGN_ARGUMENTS} $<TARGET_FILE:${libname}>
+          COMMAND ${CODESIGN} --sign - ${EXTRA_CODESIGN_ARGUMENTS} $<TARGET_FILE:${libname}>
           WORKING_DIRECTORY ${COMPILER_RT_OUTPUT_LIBRARY_DIR}
           COMMAND_EXPAND_LISTS
         )
diff --git a/compiler-rt/cmake/base-config-ix.cmake b/compiler-rt/cmake/base-config-ix.cmake
index 9717c21d8977..d92bc0e71fa1 100644
--- a/compiler-rt/cmake/base-config-ix.cmake
+++ b/compiler-rt/cmake/base-config-ix.cmake
@@ -89,6 +89,11 @@ else()
   set(COMPILER_RT_TEST_COMPILER_ID GNU)
 endif()
 
+# AppleClang expects 'Clang' as compiler-rt test compiler ID.
+if ("${COMPILER_RT_TEST_COMPILER_ID}" STREQUAL "AppleClang")
+  set(COMPILER_RT_TEST_COMPILER_ID Clang)
+endif()
+
 if(NOT DEFINED COMPILER_RT_OS_DIR)
   if(ANDROID)
     # The CMAKE_SYSTEM_NAME for Android is Android, but the OS is Linux and the
diff --git a/compiler-rt/lib/gwp_asan/tests/harness.h b/compiler-rt/lib/gwp_asan/tests/harness.h
index c96f846996d3..3fbcf991c559 100644
--- a/compiler-rt/lib/gwp_asan/tests/harness.h
+++ b/compiler-rt/lib/gwp_asan/tests/harness.h
@@ -12,7 +12,9 @@
 #include <stdarg.h>
 
 #if defined(__Fuchsia__)
+#ifndef ZXTEST_USE_STREAMABLE_MACROS
 #define ZXTEST_USE_STREAMABLE_MACROS
+#endif
 #include <zxtest/zxtest.h>
 namespace testing = zxtest;
 // zxtest defines a different ASSERT_DEATH, taking a lambda and an error message
diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
index f1fe20b255d9..6a5f4b91d11d 100644
--- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
@@ -337,6 +337,45 @@ INTERCEPTOR(FILE *, fmemopen, void *buf, size_t size, const char *mode) {
 #define RTSAN_MAYBE_INTERCEPT_FMEMOPEN
 #endif
 
+#if SANITIZER_INTERCEPT_SETVBUF
+INTERCEPTOR(void, setbuf, FILE *stream, char *buf) {
+  __rtsan_notify_intercepted_call("setbuf");
+  return REAL(setbuf)(stream, buf);
+}
+
+INTERCEPTOR(int, setvbuf, FILE *stream, char *buf, int mode, size_t size) {
+  __rtsan_notify_intercepted_call("setvbuf");
+  return REAL(setvbuf)(stream, buf, mode, size);
+}
+
+#if SANITIZER_LINUX
+INTERCEPTOR(void, setlinebuf, FILE *stream) {
+#else
+INTERCEPTOR(int, setlinebuf, FILE *stream) {
+#endif
+  __rtsan_notify_intercepted_call("setlinebuf");
+  return REAL(setlinebuf)(stream);
+}
+
+#if SANITIZER_LINUX
+INTERCEPTOR(void, setbuffer, FILE *stream, char *buf, size_t size) {
+#else
+INTERCEPTOR(void, setbuffer, FILE *stream, char *buf, int size) {
+#endif
+  __rtsan_notify_intercepted_call("setbuffer");
+  return REAL(setbuffer)(stream, buf, size);
+}
+#define RTSAN_MAYBE_INTERCEPT_SETBUF INTERCEPT_FUNCTION(setbuf)
+#define RTSAN_MAYBE_INTERCEPT_SETVBUF INTERCEPT_FUNCTION(setvbuf)
+#define RTSAN_MAYBE_INTERCEPT_SETLINEBUF INTERCEPT_FUNCTION(setlinebuf)
+#define RTSAN_MAYBE_INTERCEPT_SETBUFFER INTERCEPT_FUNCTION(setbuffer)
+#else
+#define RTSAN_MAYBE_INTERCEPT_SETBUF
+#define RTSAN_MAYBE_INTERCEPT_SETVBUF
+#define RTSAN_MAYBE_INTERCEPT_SETLINEBUF
+#define RTSAN_MAYBE_INTERCEPT_SETBUFFER
+#endif
+
 INTERCEPTOR(int, puts, const char *s) {
   __rtsan_notify_intercepted_call("puts");
   return REAL(puts)(s);
@@ -999,6 +1038,10 @@ void __rtsan::InitializeInterceptors() {
   RTSAN_MAYBE_INTERCEPT_FOPENCOOKIE;
   RTSAN_MAYBE_INTERCEPT_OPEN_MEMSTREAM;
   RTSAN_MAYBE_INTERCEPT_FMEMOPEN;
+  RTSAN_MAYBE_INTERCEPT_SETBUF;
+  RTSAN_MAYBE_INTERCEPT_SETVBUF;
+  RTSAN_MAYBE_INTERCEPT_SETLINEBUF;
+  RTSAN_MAYBE_INTERCEPT_SETBUFFER;
   INTERCEPT_FUNCTION(lseek);
   RTSAN_MAYBE_INTERCEPT_LSEEK64;
   INTERCEPT_FUNCTION(dup);
diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
index d9872c54b261..5488d3c7e205 100644
--- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
@@ -403,6 +403,56 @@ TEST_F(RtsanFileTest, FmemOpenDiesWhenRealtime) {
 }
 #endif
 
+#if SANITIZER_INTERCEPT_SETVBUF
+TEST_F(RtsanFileTest, SetbufDieWhenRealtime) {
+  char buffer[BUFSIZ];
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+
+  auto Func = [f, &buffer]() { setbuf(f, buffer); };
+
+  ExpectRealtimeDeath(Func, "setbuf");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST_F(RtsanFileTest, SetvbufDieWhenRealtime) {
+  char buffer[1024];
+  size_t size = sizeof(buffer);
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+
+  auto Func = [f, &buffer, size]() {
+    int r = setvbuf(f, buffer, _IOFBF, size);
+    EXPECT_THAT(r, Eq(0));
+  };
+
+  ExpectRealtimeDeath(Func, "setvbuf");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST_F(RtsanFileTest, SetlinebufDieWhenRealtime) {
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+
+  auto Func = [f]() { setlinebuf(f); };
+
+  ExpectRealtimeDeath(Func, "setlinebuf");
+  ExpectNonRealtimeSurvival(Func);
+}
+
+TEST_F(RtsanFileTest, SetbufferDieWhenRealtime) {
+  char buffer[1024];
+  size_t size = sizeof(buffer);
+  FILE *f = fopen(GetTemporaryFilePath(), "w");
+  EXPECT_THAT(f, Ne(nullptr));
+
+  auto Func = [f, &buffer, size]() { setbuffer(f, buffer, size); };
+
+  ExpectRealtimeDeath(Func, "setbuffer");
+  ExpectNonRealtimeSurvival(Func);
+}
+#endif
+
 class RtsanOpenedFileTest : public RtsanFileTest {
 protected:
   void SetUp() override {
diff --git a/compiler-rt/lib/tysan/tysan.cpp b/compiler-rt/lib/tysan/tysan.cpp
index 39d78e7c95e0..9c87b4782671 100644
--- a/compiler-rt/lib/tysan/tysan.cpp
+++ b/compiler-rt/lib/tysan/tysan.cpp
@@ -197,10 +197,14 @@ static void reportError(void *Addr, int Size, tysan_type_descriptor *TD,
     Printf("\n");
 
   if (pc) {
+    uptr top = 0;
+    uptr bottom = 0;
+    if (flags().print_stacktrace)
+      GetThreadStackTopAndBottom(false, &top, &bottom);
 
     bool request_fast = StackTrace::WillUseFastUnwind(true);
     BufferedStackTrace ST;
-    ST.Unwind(kStackTraceMax, pc, bp, 0, 0, 0, request_fast);
+    ST.Unwind(kStackTraceMax, pc, bp, 0, top, bottom, request_fast);
     ST.Print();
   } else {
     Printf("\n");
diff --git a/compiler-rt/lib/tysan/tysan_flags.inc b/compiler-rt/lib/tysan/tysan_flags.inc
index 98b6591f844e..be65c8e82879 100644
--- a/compiler-rt/lib/tysan/tysan_flags.inc
+++ b/compiler-rt/lib/tysan/tysan_flags.inc
@@ -15,3 +15,6 @@
 
 // TYSAN_FLAG(Type, Name, DefaultValue, Description)
 // See COMMON_FLAG in sanitizer_flags.inc for more details.
+
+TYSAN_FLAG(bool, print_stacktrace, false,
+           "Include full stacktrace into an error report")
diff --git a/compiler-rt/lib/ubsan/ubsan_value.h b/compiler-rt/lib/ubsan/ubsan_value.h
index 430c9ea0dc8d..ee523cf5ddda 100644
--- a/compiler-rt/lib/ubsan/ubsan_value.h
+++ b/compiler-rt/lib/ubsan/ubsan_value.h
@@ -150,9 +150,12 @@ public:
 
   unsigned getIntegerBitCount() const {
     DCHECK(isIntegerTy());
-    if (isSignedBitIntTy())
-      return *reinterpret_cast<const u32 *>(getBitIntBitCountPointer());
-    else
+    if (isSignedBitIntTy()) {
+      u32 BitCountValue;
+      internal_memcpy(&BitCountValue, getBitIntBitCountPointer(),
+                      sizeof(BitCountValue));
+      return BitCountValue;
+    } else
       return getIntegerBitWidth();
   }
 
diff --git a/compiler-rt/test/tysan/print_stacktrace.c b/compiler-rt/test/tysan/print_stacktrace.c
new file mode 100644
index 000000000000..3ffb6063377d
--- /dev/null
+++ b/compiler-rt/test/tysan/print_stacktrace.c
@@ -0,0 +1,22 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck --check-prefixes=CHECK,CHECK-SHORT %s < %t.out
+
+// RUN: %env_tysan_opts=print_stacktrace=1 %run %t >%t.out 2>&1
+// RUN: FileCheck --check-prefixes=CHECK,CHECK-LONG %s < %t.out
+
+float *P;
+void zero_array() {
+  int i;
+  for (i = 0; i < 1; ++i)
+    P[i] = 0.0f;
+  // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+  // CHECK: WRITE of size 4 at {{.*}} with type float accesses an existing object of type p1 float
+  // CHECK: {{#0 0x.* in zero_array .*print_stacktrace.c:}}[[@LINE-3]]
+  // CHECK-SHORT-NOT: {{#1 0x.* in main .*print_stacktrace.c}}
+  // CHECK-LONG-NEXT: {{#1 0x.* in main .*print_stacktrace.c}}
+}
+
+int main() {
+  P = (float *)&P;
+  zero_array();
+}
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 2d1c967a6068..f25f0d1e0ca3 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -160,7 +160,11 @@ end
 * `<>` as synonym for `.NE.` and `/=`
 * `$` and `@` as legal characters in names
 * Initialization in type declaration statements using `/values/`
-* Saved variables without explicit or default initializers are zero initialized.
+* Saved variables without explicit or default initializers are zero initialized,
+  except for scalar variables from the main program that are not explicitly
+  initialized or marked with an explicit SAVE attribute (these variables may be
+  placed on the stack by flang and not zero initialized). It is not advised to
+  rely on this extension in new code.
 * In a saved entity of a type with a default initializer, components without default
   values are zero initialized.
 * Kind specification with `*`, e.g. `REAL*4`
@@ -406,6 +410,7 @@ end
 * A character length specifier in a component or entity declaration
   is accepted before an array specification (`ch*3(2)`) as well
   as afterwards.
+* A zero field width is allowed for logical formatted output (`L0`).
 
 ### Extensions supported when enabled by options
 
diff --git a/flang/include/flang/Common/format.h b/flang/include/flang/Common/format.h
index de6967139c5c..c5e9fb06e260 100644
--- a/flang/include/flang/Common/format.h
+++ b/flang/include/flang/Common/format.h
@@ -463,10 +463,13 @@ template <typename CHAR> void FormatValidator<CHAR>::check_r(bool allowed) {
 template <typename CHAR> bool FormatValidator<CHAR>::check_w() {
   if (token_.kind() == TokenKind::UnsignedInteger) {
     wValue_ = integerValue_;
-    if (wValue_ == 0 &&
-        (*argString_ == 'A' || *argString_ == 'L' ||
-            stmt_ == IoStmtKind::Read)) { // C1306, 13.7.2.1p6
-      ReportError("'%s' edit descriptor 'w' value must be positive");
+    if (wValue_ == 0) {
+      if (*argString_ == 'A' || stmt_ == IoStmtKind::Read) {
+        // C1306, 13.7.2.1p6
+        ReportError("'%s' edit descriptor 'w' value must be positive");
+      } else if (*argString_ == 'L') {
+        ReportWarning("'%s' edit descriptor 'w' value should be positive");
+      }
     }
     NextToken();
     return true;
diff --git a/flang/include/flang/Evaluate/call.h b/flang/include/flang/Evaluate/call.h
index 7531d8a81e80..63277438128e 100644
--- a/flang/include/flang/Evaluate/call.h
+++ b/flang/include/flang/Evaluate/call.h
@@ -250,6 +250,7 @@ public:
 
   std::optional<Expr<SubscriptInteger>> LEN() const;
   int Rank() const;
+  static constexpr int Corank() { return 0; } // TODO
   bool IsElemental() const { return proc_.IsElemental(); }
   bool hasAlternateReturns() const { return hasAlternateReturns_; }
 
diff --git a/flang/include/flang/Evaluate/characteristics.h b/flang/include/flang/Evaluate/characteristics.h
index 11533a7259b0..357fc3e59524 100644
--- a/flang/include/flang/Evaluate/characteristics.h
+++ b/flang/include/flang/Evaluate/characteristics.h
@@ -102,6 +102,10 @@ public:
     }
     if (auto type{x.GetType()}) {
       TypeAndShape result{*type, GetShape(context, x, invariantOnly)};
+      result.corank_ = GetCorank(x);
+      if (result.corank_ > 0) {
+        result.attrs_.set(Attr::Coarray);
+      }
       if (type->category() == TypeCategory::Character) {
         if (const auto *chExpr{UnwrapExpr<Expr<SomeCharacter>>(x)}) {
           if (auto length{chExpr->LEN()}) {
diff --git a/flang/include/flang/Evaluate/constant.h b/flang/include/flang/Evaluate/constant.h
index d9866a08889f..61a814446bbf 100644
--- a/flang/include/flang/Evaluate/constant.h
+++ b/flang/include/flang/Evaluate/constant.h
@@ -65,6 +65,7 @@ public:
   ~ConstantBounds();
   const ConstantSubscripts &shape() const { return shape_; }
   int Rank() const { return GetRank(shape_); }
+  static constexpr int Corank() { return 0; }
   Constant<SubscriptInteger> SHAPE() const;
 
   // It is possible in this representation for a constant array to have
diff --git a/flang/include/flang/Evaluate/expression.h b/flang/include/flang/Evaluate/expression.h
index 9ea037a2f7c4..04f4406fc8a2 100644
--- a/flang/include/flang/Evaluate/expression.h
+++ b/flang/include/flang/Evaluate/expression.h
@@ -92,6 +92,7 @@ public:
 
   std::optional<DynamicType> GetType() const;
   int Rank() const;
+  int Corank() const;
   std::string AsFortran() const;
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   LLVM_DUMP_METHOD void dump() const;
@@ -190,6 +191,7 @@ public:
       return rank;
     }
   }
+  static constexpr int Corank() { return 0; }
 
   bool operator==(const Operation &that) const {
     return operand_ == that.operand_;
@@ -395,6 +397,7 @@ struct ImpliedDoIndex {
   using Result = SubscriptInteger;
   bool operator==(const ImpliedDoIndex &) const;
   static constexpr int Rank() { return 0; }
+  static constexpr int Corank() { return 0; }
   parser::CharBlock name; // nested implied DOs must use distinct names
 };
 
@@ -441,6 +444,7 @@ public:
 
   bool operator==(const ArrayConstructorValues &) const;
   static constexpr int Rank() { return 1; }
+  static constexpr int Corank() { return 0; }
   template <typename A> common::NoLvalue<A> Push(A &&x) {
     values_.emplace_back(std::move(x));
   }
@@ -680,6 +684,7 @@ public:
   int Rank() const {
     return common::visit([](const auto &x) { return x.Rank(); }, u);
   }
+  static constexpr int Corank() { return 0; }
   llvm::raw_ostream &AsFortran(llvm::raw_ostream &o) const;
   common::MapTemplate<Relational, DirectlyComparableTypes> u;
 };
@@ -766,7 +771,8 @@ public:
   std::optional<Expr<SomeType>> Find(const Symbol &) const;
 
   StructureConstructor &Add(const semantics::Symbol &, Expr<SomeType> &&);
-  int Rank() const { return 0; }
+  static constexpr int Rank() { return 0; }
+  static constexpr int Corank() { return 0; }
   DynamicType GetType() const;
   llvm::raw_ostream &AsFortran(llvm::raw_ostream &) const;
 
@@ -820,7 +826,8 @@ using BOZLiteralConstant = typename LargestReal::Scalar::Word;
 // Null pointers without MOLD= arguments are typed by context.
 struct NullPointer {
   constexpr bool operator==(const NullPointer &) const { return true; }
-  constexpr int Rank() const { return 0; }
+  static constexpr int Rank() { return 0; }
+  static constexpr int Corank() { return 0; }
 };
 
 // Procedure pointer targets are treated as if they were typeless.
diff --git a/flang/include/flang/Evaluate/shape.h b/flang/include/flang/Evaluate/shape.h
index e33044c0d34e..e679a0012354 100644
--- a/flang/include/flang/Evaluate/shape.h
+++ b/flang/include/flang/Evaluate/shape.h
@@ -117,6 +117,14 @@ MaybeExtentExpr GetExtent(const Subscript &, const NamedEntity &, int dimension,
 MaybeExtentExpr GetExtent(FoldingContext &, const Subscript &,
     const NamedEntity &, int dimension, bool invariantOnly = true);
 
+// Similar analyses for coarrays
+MaybeExtentExpr GetLCOBOUND(
+    const Symbol &, int dimension, bool invariantOnly = true);
+MaybeExtentExpr GetUCOBOUND(
+    const Symbol &, int dimension, bool invariantOnly = true);
+Shape GetLCOBOUNDs(const Symbol &, bool invariantOnly = true);
+Shape GetUCOBOUNDs(const Symbol &, bool invariantOnly = true);
+
 // Compute an element count for a triplet or trip count for a DO.
 ExtentExpr CountTrips(
     ExtentExpr &&lower, ExtentExpr &&upper, ExtentExpr &&stride);
diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index f586c59d46e5..ec5fc7ab0148 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -102,22 +102,26 @@ template <typename A> bool IsAssumedRank(const A *x) {
   return x && IsAssumedRank(*x);
 }
 
-// Predicate: true when an expression is a coarray (corank > 0)
-bool IsCoarray(const ActualArgument &);
-bool IsCoarray(const Symbol &);
-template <typename A> bool IsCoarray(const A &) { return false; }
-template <typename A> bool IsCoarray(const Designator<A> &designator) {
-  if (const auto *symbol{std::get_if<SymbolRef>(&designator.u)}) {
-    return IsCoarray(**symbol);
-  }
-  return false;
+// Finds the corank of an entity, possibly packaged in various ways.
+// Unlike rank, only data references have corank > 0.
+int GetCorank(const ActualArgument &);
+static inline int GetCorank(const Symbol &symbol) { return symbol.Corank(); }
+template <typename A> int GetCorank(const A &) { return 0; }
+template <typename T> int GetCorank(const Designator<T> &designator) {
+  return designator.Corank();
 }
-template <typename T> bool IsCoarray(const Expr<T> &expr) {
-  return common::visit([](const auto &x) { return IsCoarray(x); }, expr.u);
+template <typename T> int GetCorank(const Expr<T> &expr) {
+  return common::visit([](const auto &x) { return GetCorank(x); }, expr.u);
 }
-template <typename A> bool IsCoarray(const std::optional<A> &x) {
-  return x && IsCoarray(*x);
+template <typename A> int GetCorank(const std::optional<A> &x) {
+  return x ? GetCorank(*x) : 0;
 }
+template <typename A> int GetCorank(const A *x) {
+  return x ? GetCorank(*x) : 0;
+}
+
+// Predicate: true when an expression is a coarray (corank > 0)
+template <typename A> bool IsCoarray(const A &x) { return GetCorank(x) > 0; }
 
 // Generalizing packagers: these take operations and expressions of more
 // specific types and wrap them in Expr<> containers of more abstract types.
diff --git a/flang/include/flang/Evaluate/variable.h b/flang/include/flang/Evaluate/variable.h
index 9565826dbfae..b454d37d93e5 100644
--- a/flang/include/flang/Evaluate/variable.h
+++ b/flang/include/flang/Evaluate/variable.h
@@ -51,6 +51,7 @@ template <typename T> struct Variable;
 struct BaseObject {
   EVALUATE_UNION_CLASS_BOILERPLATE(BaseObject)
   int Rank() const;
+  int Corank() const;
   std::optional<Expr<SubscriptInteger>> LEN() const;
   llvm::raw_ostream &AsFortran(llvm::raw_ostream &) const;
   const Symbol *symbol() const {
@@ -84,6 +85,7 @@ public:
   SymbolRef &symbol() { return symbol_; }
 
   int Rank() const;
+  int Corank() const;
   const Symbol &GetFirstSymbol() const;
   const Symbol &GetLastSymbol() const { return symbol_; }
   std::optional<Expr<SubscriptInteger>> LEN() const;
@@ -116,6 +118,7 @@ public:
   Component *UnwrapComponent();
 
   int Rank() const;
+  int Corank() const;
   std::optional<Expr<SubscriptInteger>> LEN() const;
   bool operator==(const NamedEntity &) const;
   llvm::raw_ostream &AsFortran(llvm::raw_ostream &) const;
@@ -147,6 +150,7 @@ public:
   const Symbol &parameter() const { return parameter_; }
 
   static constexpr int Rank() { return 0; } // always scalar
+  static constexpr int Corank() { return 0; }
   bool operator==(const TypeParamInquiry &) const;
   llvm::raw_ostream &AsFortran(llvm::raw_ostream &) const;
 
@@ -224,6 +228,7 @@ public:
   }
 
   int Rank() const;
+  int Corank() const;
   const Symbol &GetFirstSymbol() const;
   const Symbol &GetLastSymbol() const;
   std::optional<Expr<SubscriptInteger>> LEN() const;
@@ -271,6 +276,7 @@ public:
   CoarrayRef &set_team(Expr<SomeInteger> &&, bool isTeamNumber = false);
 
   int Rank() const;
+  int Corank() const { return 0; }
   const Symbol &GetFirstSymbol() const;
   const Symbol &GetLastSymbol() const;
   NamedEntity GetBase() const;
@@ -294,6 +300,7 @@ private:
 struct DataRef {
   EVALUATE_UNION_CLASS_BOILERPLATE(DataRef)
   int Rank() const;
+  int Corank() const;
   const Symbol &GetFirstSymbol() const;
   const Symbol &GetLastSymbol() const;
   std::optional<Expr<SubscriptInteger>> LEN() const;
@@ -331,6 +338,7 @@ public:
   Parent &parent() { return parent_; }
 
   int Rank() const;
+  int Corank() const;
   template <typename A> const A *GetParentIf() const {
     return std::get_if<A>(&parent_);
   }
@@ -361,6 +369,7 @@ public:
   const DataRef &complex() const { return complex_; }
   Part part() const { return part_; }
   int Rank() const;
+  int Corank() const;
   const Symbol &GetFirstSymbol() const { return complex_.GetFirstSymbol(); }
   const Symbol &GetLastSymbol() const { return complex_.GetLastSymbol(); }
   bool operator==(const ComplexPart &) const;
@@ -396,6 +405,7 @@ public:
 
   std::optional<DynamicType> GetType() const;
   int Rank() const;
+  int Corank() const;
   BaseObject GetBaseObject() const;
   const Symbol *GetLastSymbol() const;
   std::optional<Expr<SubscriptInteger>> LEN() const;
@@ -421,6 +431,7 @@ public:
   int dimension() const { return dimension_; }
 
   static constexpr int Rank() { return 0; } // always scalar
+  static constexpr int Corank() { return 0; }
   bool operator==(const DescriptorInquiry &) const;
   llvm::raw_ostream &AsFortran(llvm::raw_ostream &) const;
 
diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
index 6ee4370c99dc..c5d86e713f25 100644
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -769,6 +769,11 @@ mlir::Value genMaxWithZero(fir::FirOpBuilder &builder, mlir::Location loc,
 mlir::Value genCPtrOrCFunptrAddr(fir::FirOpBuilder &builder, mlir::Location loc,
                                  mlir::Value cPtr, mlir::Type ty);
 
+/// The type(C_DEVPTR) is defined as the derived type with only one
+/// component of C_PTR type. Get the C address from the C_PTR component.
+mlir::Value genCDevPtrAddr(fir::FirOpBuilder &builder, mlir::Location loc,
+                           mlir::Value cDevPtr, mlir::Type ty);
+
 /// Get the C address value.
 mlir::Value genCPtrOrCFunptrValue(fir::FirOpBuilder &builder,
                                   mlir::Location loc, mlir::Value cPtr);
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 3d0516555f76..18f84c7021e1 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -214,6 +214,7 @@ struct IntrinsicLibrary {
                                            llvm::ArrayRef<fir::ExtendedValue>);
   fir::ExtendedValue genCAssociatedCPtr(mlir::Type,
                                         llvm::ArrayRef<fir::ExtendedValue>);
+  fir::ExtendedValue genCDevLoc(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genErfcScaled(mlir::Type resultType,
                             llvm::ArrayRef<mlir::Value> args);
   void genCFPointer(llvm::ArrayRef<fir::ExtendedValue>);
diff --git a/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td b/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td
index 1dbde5c1c730..2414de496d45 100644
--- a/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td
+++ b/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td
@@ -57,6 +57,9 @@ def StrictSmallerWidthPred : Constraint<CPred<
                        "$0.getType().getIntOrFloatBitWidth() < "
                        "$1.getType().getIntOrFloatBitWidth()">>;
 
+def PointerCompatiblePred
+    : Constraint<CPred<"fir::ConvertOp::isPointerCompatible($0.getType())">>;
+
 // floats or ints that undergo successive extensions or successive truncations.
 def ConvertConvertOptPattern
     : Pat<(fir_ConvertOp:$res (fir_ConvertOp:$irm $arg)),
@@ -112,4 +115,18 @@ def ForwardConstantConvertPattern
           (createConstantOp $res, $attr),
           [(IndexTypePred $res), (IntegerTypePred $cnt)]>;
 
+// Optimize redundant pointer conversions, e.g.:
+// %1 = fir.convert %0 :
+//     (!fir.heap<!fir.array<2xf32>>) -> !fir.ref<!fir.array<2xf32>>
+// %2 = fir.convert %1 :
+//     (!fir.ref<!fir.array<2xf32>>) -> !fir.heap<!fir.array<2xf32>>
+// Will be optimized into:
+// %2 = fir.convert %0 :
+//     (!fir.heap<!fir.array<2xf32>>) -> !fir.heap<!fir.array<2xf32>>
+// which is redundant due to RedundantConvertOptPattern.
+def ChainedPointerConvertsPattern
+    : Pat<(fir_ConvertOp:$res(fir_ConvertOp:$irm $arg)), (fir_ConvertOp $arg),
+          [(PointerCompatiblePred $arg), (PointerCompatiblePred $irm),
+           (PointerCompatiblePred $res)]>;
+
 #endif // FORTRAN_FIR_REWRITE_PATTERNS
diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h
index 2f97efddf7f7..bc6abccac1bb 100644
--- a/flang/include/flang/Semantics/symbol.h
+++ b/flang/include/flang/Semantics/symbol.h
@@ -861,23 +861,7 @@ public:
   bool operator!=(const Symbol &that) const { return !(*this == that); }
 
   int Rank() const { return RankImpl(); }
-
-  int Corank() const {
-    return common::visit(
-        common::visitors{
-            [](const SubprogramDetails &sd) {
-              return sd.isFunction() ? sd.result().Corank() : 0;
-            },
-            [](const GenericDetails &) {
-              return 0; /*TODO*/
-            },
-            [](const UseDetails &x) { return x.symbol().Corank(); },
-            [](const HostAssocDetails &x) { return x.symbol().Corank(); },
-            [](const ObjectEntityDetails &oed) { return oed.coshape().Rank(); },
-            [](const auto &) { return 0; },
-        },
-        details_);
-  }
+  int Corank() const { return CorankImpl(); }
 
   // If there is a parent component, return a pointer to its derived type spec.
   // The Scope * argument defaults to this->scope_ but should be overridden
@@ -955,6 +939,32 @@ private:
         },
         details_);
   }
+  inline int CorankImpl(int depth = startRecursionDepth) const {
+    if (depth-- == 0) {
+      return 0;
+    }
+    return common::visit(
+        common::visitors{
+            [&](const SubprogramDetails &sd) {
+              return sd.isFunction() ? sd.result().CorankImpl(depth) : 0;
+            },
+            [](const GenericDetails &) { return 0; },
+            [&](const ProcEntityDetails &ped) {
+              const Symbol *iface{ped.procInterface()};
+              return iface ? iface->CorankImpl(depth) : 0;
+            },
+            [&](const UseDetails &x) { return x.symbol().CorankImpl(depth); },
+            [&](const HostAssocDetails &x) {
+              return x.symbol().CorankImpl(depth);
+            },
+            [](const ObjectEntityDetails &oed) { return oed.coshape().Rank(); },
+            [](const AssocEntityDetails &aed) {
+              return aed.expr() ? aed.expr()->Corank() : 0;
+            },
+            [](const auto &) { return 0; },
+        },
+        details_);
+  }
   template <std::size_t> friend class Symbols;
   template <class, std::size_t> friend class std::array;
 };
diff --git a/flang/lib/Evaluate/characteristics.cpp b/flang/lib/Evaluate/characteristics.cpp
index 324d6b8dde73..3912d1c4b477 100644
--- a/flang/lib/Evaluate/characteristics.cpp
+++ b/flang/lib/Evaluate/characteristics.cpp
@@ -227,15 +227,14 @@ void TypeAndShape::AcquireAttrs(const semantics::Symbol &symbol) {
   } else if (semantics::IsAssumedSizeArray(symbol)) {
     attrs_.set(Attr::AssumedSize);
   }
+  if (int n{GetCorank(symbol)}) {
+    corank_ = n;
+    attrs_.set(Attr::Coarray);
+  }
   if (const auto *object{
-          symbol.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()}) {
-    corank_ = object->coshape().Rank();
-    if (object->IsAssumedRank()) {
-      attrs_.set(Attr::AssumedRank);
-    }
-    if (object->IsCoarray()) {
-      attrs_.set(Attr::Coarray);
-    }
+          symbol.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()};
+      object && object->IsAssumedRank()) {
+    attrs_.set(Attr::AssumedRank);
   }
 }
 
diff --git a/flang/lib/Evaluate/expression.cpp b/flang/lib/Evaluate/expression.cpp
index 9514ac8e3f65..759fe5bc71b6 100644
--- a/flang/lib/Evaluate/expression.cpp
+++ b/flang/lib/Evaluate/expression.cpp
@@ -113,6 +113,18 @@ template <typename A> int ExpressionBase<A>::Rank() const {
       derived().u);
 }
 
+template <typename A> int ExpressionBase<A>::Corank() const {
+  return common::visit(
+      [](const auto &x) {
+        if constexpr (common::HasMember<decltype(x), TypelessExpression>) {
+          return 0;
+        } else {
+          return x.Corank();
+        }
+      },
+      derived().u);
+}
+
 DynamicType Parentheses<SomeDerived>::GetType() const {
   return left().GetType().value();
 }
diff --git a/flang/lib/Evaluate/fold-integer.cpp b/flang/lib/Evaluate/fold-integer.cpp
index 26ae33faffe1..352dec4bb5ee 100644
--- a/flang/lib/Evaluate/fold-integer.cpp
+++ b/flang/lib/Evaluate/fold-integer.cpp
@@ -71,6 +71,28 @@ static bool CheckDimArg(const std::optional<ActualArgument> &dimArg,
   return true;
 }
 
+static bool CheckCoDimArg(const std::optional<ActualArgument> &dimArg,
+    const Symbol &symbol, parser::ContextualMessages &messages,
+    std::optional<int> &dimVal) {
+  dimVal.reset();
+  if (int corank{symbol.Corank()}; corank > 0) {
+    if (auto dim64{ToInt64(dimArg)}) {
+      if (*dim64 < 1) {
+        messages.Say("DIM=%jd dimension must be positive"_err_en_US, *dim64);
+        return false;
+      } else if (*dim64 > corank) {
+        messages.Say(
+            "DIM=%jd dimension is out of range for corank-%d coarray"_err_en_US,
+            *dim64, corank);
+        return false;
+      } else {
+        dimVal = static_cast<int>(*dim64 - 1); // 1-based to 0-based
+      }
+    }
+  }
+  return true;
+}
+
 // Class to retrieve the constant bound of an expression which is an
 // array that devolves to a type of Constant<T>
 class GetConstantArrayBoundHelper {
@@ -264,6 +286,37 @@ Expr<Type<TypeCategory::Integer, KIND>> UBOUND(FoldingContext &context,
   return Expr<T>{std::move(funcRef)};
 }
 
+// LCOBOUND() & UCOBOUND()
+template <int KIND>
+Expr<Type<TypeCategory::Integer, KIND>> COBOUND(FoldingContext &context,
+    FunctionRef<Type<TypeCategory::Integer, KIND>> &&funcRef, bool isUCOBOUND) {
+  using T = Type<TypeCategory::Integer, KIND>;
+  ActualArguments &args{funcRef.arguments()};
+  if (const Symbol * coarray{UnwrapWholeSymbolOrComponentDataRef(args[0])}) {
+    std::optional<int> dim;
+    if (funcRef.Rank() == 0) {
+      // Optional DIM= argument is present: result is scalar.
+      if (!CheckCoDimArg(args[1], *coarray, context.messages(), dim)) {
+        return MakeInvalidIntrinsic<T>(std::move(funcRef));
+      } else if (!dim) {
+        // DIM= is present but not constant, or error
+        return Expr<T>{std::move(funcRef)};
+      }
+    }
+    if (dim) {
+      if (auto cb{isUCOBOUND ? GetUCOBOUND(*coarray, *dim)
+                             : GetLCOBOUND(*coarray, *dim)}) {
+        return Fold(context, ConvertToType<T>(std::move(*cb)));
+      }
+    } else if (auto cbs{
+                   AsExtentArrayExpr(isUCOBOUND ? GetUCOBOUNDs(*coarray)
+                                                : GetLCOBOUNDs(*coarray))}) {
+      return Fold(context, ConvertToType<T>(Expr<ExtentType>{std::move(*cbs)}));
+    }
+  }
+  return Expr<T>{std::move(funcRef)};
+}
+
 // COUNT()
 template <typename T, int MASK_KIND> class CountAccumulator {
   using MaskT = Type<TypeCategory::Logical, MASK_KIND>;
@@ -1105,6 +1158,8 @@ Expr<Type<TypeCategory::Integer, KIND>> FoldIntrinsicFunction(
     }
   } else if (name == "lbound") {
     return LBOUND(context, std::move(funcRef));
+  } else if (name == "lcobound") {
+    return COBOUND(context, std::move(funcRef), /*isUCOBOUND=*/false);
   } else if (name == "leadz" || name == "trailz" || name == "poppar" ||
       name == "popcnt") {
     if (auto *sn{UnwrapExpr<Expr<SomeKind<T::category>>>(args[0])}) {
@@ -1396,6 +1451,8 @@ Expr<Type<TypeCategory::Integer, KIND>> FoldIntrinsicFunction(
     }
   } else if (name == "ubound") {
     return UBOUND(context, std::move(funcRef));
+  } else if (name == "ucobound") {
+    return COBOUND(context, std::move(funcRef), /*isUCOBOUND=*/true);
   } else if (name == "__builtin_numeric_storage_size") {
     if (!context.moduleFileName()) {
       // Don't fold this reference until it appears in the module file
diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp
index 28805efb177e..0dc8e121ea16 100644
--- a/flang/lib/Evaluate/intrinsics.cpp
+++ b/flang/lib/Evaluate/intrinsics.cpp
@@ -958,7 +958,7 @@ static const IntrinsicInterface genericIntrinsicFunction[]{
         {{"coarray", AnyData, Rank::coarray}, RequiredDIM, OptionalTEAM},
         DefaultInt, Rank::scalar, IntrinsicClass::transformationalFunction},
     {"this_image", {{"coarray", AnyData, Rank::coarray}, OptionalTEAM},
-        DefaultInt, Rank::scalar, IntrinsicClass::transformationalFunction},
+        DefaultInt, Rank::vector, IntrinsicClass::transformationalFunction},
     {"this_image", {OptionalTEAM}, DefaultInt, Rank::scalar,
         IntrinsicClass::transformationalFunction},
     {"tiny",
@@ -2663,6 +2663,8 @@ private:
       ActualArguments &, FoldingContext &) const;
   std::optional<SpecificCall> HandleC_Loc(
       ActualArguments &, FoldingContext &) const;
+  std::optional<SpecificCall> HandleC_Devloc(
+      ActualArguments &, FoldingContext &) const;
   const std::string &ResolveAlias(const std::string &name) const {
     auto iter{aliases_.find(name)};
     return iter == aliases_.end() ? name : iter->second;
@@ -2690,7 +2692,8 @@ bool IntrinsicProcTable::Implementation::IsIntrinsicFunction(
     return true;
   }
   // special cases
-  return name == "__builtin_c_loc" || name == "null";
+  return name == "__builtin_c_loc" || name == "__builtin_c_devloc" ||
+      name == "null";
 }
 bool IntrinsicProcTable::Implementation::IsIntrinsicSubroutine(
     const std::string &name0) const {
@@ -3080,6 +3083,73 @@ std::optional<SpecificCall> IntrinsicProcTable::Implementation::HandleC_Loc(
   return std::nullopt;
 }
 
+// CUDA Fortran C_DEVLOC(x)
+std::optional<SpecificCall> IntrinsicProcTable::Implementation::HandleC_Devloc(
+    ActualArguments &arguments, FoldingContext &context) const {
+  static const char *const keywords[]{"cptr", nullptr};
+
+  if (CheckAndRearrangeArguments(arguments, context.messages(), keywords)) {
+    CHECK(arguments.size() == 1);
+    const auto *expr{arguments[0].value().UnwrapExpr()};
+    if (auto typeAndShape{characteristics::TypeAndShape::Characterize(
+            arguments[0], context)}) {
+      if (expr && !IsContiguous(*expr, context).value_or(true)) {
+        context.messages().Say(arguments[0]->sourceLocation(),
+            "C_DEVLOC() argument must be contiguous"_err_en_US);
+      }
+      if (auto constExtents{AsConstantExtents(context, typeAndShape->shape())};
+          constExtents && GetSize(*constExtents) == 0) {
+        context.messages().Say(arguments[0]->sourceLocation(),
+            "C_DEVLOC() argument may not be a zero-sized array"_err_en_US);
+      }
+      if (!(typeAndShape->type().category() != TypeCategory::Derived ||
+              typeAndShape->type().IsAssumedType() ||
+              (!typeAndShape->type().IsPolymorphic() &&
+                  CountNonConstantLenParameters(
+                      typeAndShape->type().GetDerivedTypeSpec()) == 0))) {
+        context.messages().Say(arguments[0]->sourceLocation(),
+            "C_DEVLOC() argument must have an intrinsic type, assumed type, or non-polymorphic derived type with no non-constant length parameter"_err_en_US);
+      } else if (typeAndShape->type().knownLength().value_or(1) == 0) {
+        context.messages().Say(arguments[0]->sourceLocation(),
+            "C_DEVLOC() argument may not be zero-length character"_err_en_US);
+      } else if (typeAndShape->type().category() != TypeCategory::Derived &&
+          !IsInteroperableIntrinsicType(typeAndShape->type()).value_or(true)) {
+        if (typeAndShape->type().category() == TypeCategory::Character &&
+            typeAndShape->type().kind() == 1) {
+          // Default character kind, but length is not known to be 1
+          if (context.languageFeatures().ShouldWarn(
+                  common::UsageWarning::CharacterInteroperability)) {
+            context.messages().Say(
+                common::UsageWarning::CharacterInteroperability,
+                arguments[0]->sourceLocation(),
+                "C_DEVLOC() argument has non-interoperable character length"_warn_en_US);
+          }
+        } else if (context.languageFeatures().ShouldWarn(
+                       common::UsageWarning::Interoperability)) {
+          context.messages().Say(common::UsageWarning::Interoperability,
+              arguments[0]->sourceLocation(),
+              "C_DEVLOC() argument has non-interoperable intrinsic type or kind"_warn_en_US);
+        }
+      }
+
+      characteristics::DummyDataObject ddo{std::move(*typeAndShape)};
+      ddo.intent = common::Intent::In;
+      return SpecificCall{
+          SpecificIntrinsic{"__builtin_c_devloc"s,
+              characteristics::Procedure{
+                  characteristics::FunctionResult{
+                      DynamicType{GetBuiltinDerivedType(
+                          builtinsScope_, "__builtin_c_devptr")}},
+                  characteristics::DummyArguments{
+                      characteristics::DummyArgument{"cptr"s, std::move(ddo)}},
+                  characteristics::Procedure::Attrs{
+                      characteristics::Procedure::Attr::Pure}}},
+          std::move(arguments)};
+    }
+  }
+  return std::nullopt;
+}
+
 static bool CheckForNonPositiveValues(FoldingContext &context,
     const ActualArgument &arg, const std::string &procName,
     const std::string &argName) {
@@ -3119,27 +3189,6 @@ static bool CheckForNonPositiveValues(FoldingContext &context,
   return ok;
 }
 
-static bool CheckDimAgainstCorank(SpecificCall &call, FoldingContext &context) {
-  bool ok{true};
-  if (const auto &coarrayArg{call.arguments[0]}) {
-    if (const auto &dimArg{call.arguments[1]}) {
-      if (const auto *symbol{
-              UnwrapWholeSymbolDataRef(coarrayArg->UnwrapExpr())}) {
-        const auto corank = symbol->Corank();
-        if (const auto dimNum{ToInt64(dimArg->UnwrapExpr())}) {
-          if (dimNum < 1 || dimNum > corank) {
-            ok = false;
-            context.messages().Say(dimArg->sourceLocation(),
-                "DIM=%jd dimension is out of range for coarray with corank %d"_err_en_US,
-                static_cast<std::intmax_t>(*dimNum), corank);
-          }
-        }
-      }
-    }
-  }
-  return ok;
-}
-
 static bool CheckAtomicDefineAndRef(FoldingContext &context,
     const std::optional<ActualArgument> &atomArg,
     const std::optional<ActualArgument> &valueArg,
@@ -3207,8 +3256,6 @@ static bool ApplySpecificChecks(SpecificCall &call, FoldingContext &context) {
     if (const auto &arg{call.arguments[0]}) {
       ok = CheckForNonPositiveValues(context, *arg, name, "image");
     }
-  } else if (name == "lcobound") {
-    return CheckDimAgainstCorank(call, context);
   } else if (name == "loc") {
     const auto &arg{call.arguments[0]};
     ok =
@@ -3218,8 +3265,6 @@ static bool ApplySpecificChecks(SpecificCall &call, FoldingContext &context) {
           arg ? arg->sourceLocation() : context.messages().at(),
           "Argument of LOC() must be an object or procedure"_err_en_US);
     }
-  } else if (name == "ucobound") {
-    return CheckDimAgainstCorank(call, context);
   }
   return ok;
 }
@@ -3270,6 +3315,8 @@ std::optional<SpecificCall> IntrinsicProcTable::Implementation::Probe(
   } else { // function
     if (call.name == "__builtin_c_loc") {
       return HandleC_Loc(arguments, context);
+    } else if (call.name == "__builtin_c_devloc") {
+      return HandleC_Devloc(arguments, context);
     } else if (call.name == "null") {
       return HandleNull(arguments, context);
     }
diff --git a/flang/lib/Evaluate/shape.cpp b/flang/lib/Evaluate/shape.cpp
index c62d0cb0ff29..c7b2156a3de1 100644
--- a/flang/lib/Evaluate/shape.cpp
+++ b/flang/lib/Evaluate/shape.cpp
@@ -723,6 +723,58 @@ Shape GetUBOUNDs(const NamedEntity &base, bool invariantOnly) {
   return GetUBOUNDs(nullptr, base, invariantOnly);
 }
 
+MaybeExtentExpr GetLCOBOUND(
+    const Symbol &symbol0, int dimension, bool invariantOnly) {
+  const Symbol &symbol{ResolveAssociations(symbol0)};
+  if (const auto *object{symbol.detailsIf<semantics::ObjectEntityDetails>()}) {
+    int corank{object->coshape().Rank()};
+    if (dimension < corank) {
+      const semantics::ShapeSpec &shapeSpec{object->coshape()[dimension]};
+      if (const auto &lcobound{shapeSpec.lbound().GetExplicit()}) {
+        if (!invariantOnly || IsScopeInvariantExpr(*lcobound)) {
+          return *lcobound;
+        }
+      }
+    }
+  }
+  return std::nullopt;
+}
+
+MaybeExtentExpr GetUCOBOUND(
+    const Symbol &symbol0, int dimension, bool invariantOnly) {
+  const Symbol &symbol{ResolveAssociations(symbol0)};
+  if (const auto *object{symbol.detailsIf<semantics::ObjectEntityDetails>()}) {
+    int corank{object->coshape().Rank()};
+    if (dimension < corank - 1) {
+      const semantics::ShapeSpec &shapeSpec{object->coshape()[dimension]};
+      if (const auto ucobound{shapeSpec.ubound().GetExplicit()}) {
+        if (!invariantOnly || IsScopeInvariantExpr(*ucobound)) {
+          return *ucobound;
+        }
+      }
+    }
+  }
+  return std::nullopt;
+}
+
+Shape GetLCOBOUNDs(const Symbol &symbol, bool invariantOnly) {
+  Shape result;
+  int corank{symbol.Corank()};
+  for (int dim{0}; dim < corank; ++dim) {
+    result.emplace_back(GetLCOBOUND(symbol, dim, invariantOnly));
+  }
+  return result;
+}
+
+Shape GetUCOBOUNDs(const Symbol &symbol, bool invariantOnly) {
+  Shape result;
+  int corank{symbol.Corank()};
+  for (int dim{0}; dim < corank; ++dim) {
+    result.emplace_back(GetUCOBOUND(symbol, dim, invariantOnly));
+  }
+  return result;
+}
+
 auto GetShapeHelper::operator()(const Symbol &symbol) const -> Result {
   return common::visit(
       common::visitors{
@@ -937,6 +989,10 @@ auto GetShapeHelper::operator()(const ProcedureRef &call) const -> Result {
       if (!call.arguments().empty()) {
         return (*this)(call.arguments()[0]);
       }
+    } else if (intrinsic->name == "lcobound" || intrinsic->name == "ucobound") {
+      if (call.arguments().size() == 3 && !call.arguments().at(1).has_value()) {
+        return Shape(1, ExtentExpr{GetCorank(call.arguments().at(0))});
+      }
     } else if (intrinsic->name == "matmul") {
       if (call.arguments().size() == 2) {
         if (auto ashape{(*this)(call.arguments()[0])}) {
@@ -1076,6 +1132,11 @@ auto GetShapeHelper::operator()(const ProcedureRef &call) const -> Result {
           }
         }
       }
+    } else if (intrinsic->name == "this_image") {
+      if (call.arguments().size() == 2) {
+        // THIS_IMAGE(coarray, no DIM, [TEAM])
+        return Shape(1, ExtentExpr{GetCorank(call.arguments().at(0))});
+      }
     } else if (intrinsic->name == "transpose") {
       if (call.arguments().size() >= 1) {
         if (auto shape{(*this)(call.arguments().at(0))}) {
diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp
index 6299084d729b..6bd623a690e3 100644
--- a/flang/lib/Evaluate/tools.cpp
+++ b/flang/lib/Evaluate/tools.cpp
@@ -906,13 +906,9 @@ bool IsAssumedRank(const ActualArgument &arg) {
   }
 }
 
-bool IsCoarray(const ActualArgument &arg) {
+int GetCorank(const ActualArgument &arg) {
   const auto *expr{arg.UnwrapExpr()};
-  return expr && IsCoarray(*expr);
-}
-
-bool IsCoarray(const Symbol &symbol) {
-  return GetAssociationRoot(symbol).Corank() > 0;
+  return GetCorank(*expr);
 }
 
 bool IsProcedureDesignator(const Expr<SomeType> &expr) {
diff --git a/flang/lib/Evaluate/variable.cpp b/flang/lib/Evaluate/variable.cpp
index 707a2065ca30..841d0f71ed0e 100644
--- a/flang/lib/Evaluate/variable.cpp
+++ b/flang/lib/Evaluate/variable.cpp
@@ -465,6 +465,59 @@ template <typename T> int Designator<T>::Rank() const {
       u);
 }
 
+// Corank()
+int BaseObject::Corank() const {
+  return common::visit(common::visitors{
+                           [](SymbolRef symbol) { return symbol->Corank(); },
+                           [](const StaticDataObject::Pointer &) { return 0; },
+                       },
+      u);
+}
+
+int Component::Corank() const {
+  if (int corank{symbol_->Corank()}; corank > 0) {
+    return corank;
+  }
+  return base().Corank();
+}
+
+int NamedEntity::Corank() const {
+  return common::visit(common::visitors{
+                           [](const SymbolRef s) { return s->Corank(); },
+                           [](const Component &c) { return c.Corank(); },
+                       },
+      u_);
+}
+
+int ArrayRef::Corank() const { return base().Corank(); }
+
+int DataRef::Corank() const {
+  return common::visit(common::visitors{
+                           [](SymbolRef symbol) { return symbol->Corank(); },
+                           [](const auto &x) { return x.Corank(); },
+                       },
+      u);
+}
+
+int Substring::Corank() const {
+  return common::visit(
+      common::visitors{
+          [](const DataRef &dataRef) { return dataRef.Corank(); },
+          [](const StaticDataObject::Pointer &) { return 0; },
+      },
+      parent_);
+}
+
+int ComplexPart::Corank() const { return complex_.Corank(); }
+
+template <typename T> int Designator<T>::Corank() const {
+  return common::visit(common::visitors{
+                           [](SymbolRef symbol) { return symbol->Corank(); },
+                           [](const auto &x) { return x.Corank(); },
+                       },
+      u);
+}
+
 // GetBaseObject(), GetFirstSymbol(), GetLastSymbol(), &c.
 const Symbol &Component::GetFirstSymbol() const {
   return base_.value().GetFirstSymbol();
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 79386c92d552..340efb1c63a5 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -766,6 +766,11 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
     opts.features.Enable(Fortran::common::LanguageFeature::DefaultSave);
   }
 
+  // -fsave-main-program
+  if (args.hasArg(clang::driver::options::OPT_fsave_main_program)) {
+    opts.features.Enable(Fortran::common::LanguageFeature::SaveMainProgram);
+  }
+
   if (args.hasArg(
           clang::driver::options::OPT_falternative_parameter_statement)) {
     opts.features.Enable(Fortran::common::LanguageFeature::OldStyleParameter);
diff --git a/flang/lib/Frontend/FrontendAction.cpp b/flang/lib/Frontend/FrontendAction.cpp
index 041182bdf617..9a555bc7cd23 100644
--- a/flang/lib/Frontend/FrontendAction.cpp
+++ b/flang/lib/Frontend/FrontendAction.cpp
@@ -232,6 +232,19 @@ bool FrontendAction::reportFatalErrors(const char (&message)[N]) {
                                            instance->getAllCookedSources());
     return true;
   }
+  if (instance->getParsing().parseTree().has_value() &&
+      !instance->getParsing().consumedWholeFile()) {
+    // Parsing failed without error.
+    const unsigned diagID = instance->getDiagnostics().getCustomDiagID(
+        clang::DiagnosticsEngine::Error, message);
+    instance->getDiagnostics().Report(diagID) << getCurrentFileOrBufferName();
+    instance->getParsing().messages().Emit(llvm::errs(),
+                                           instance->getAllCookedSources());
+    instance->getParsing().EmitMessage(
+        llvm::errs(), instance->getParsing().finalRestingPlace(),
+        "parser FAIL (final position)", "error: ", llvm::raw_ostream::RED);
+    return true;
+  }
   return false;
 }
 
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 603cb039d20b..310cd650349c 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -566,9 +566,11 @@ void DebugMeasureParseTreeAction::executeAction() {
   // Parse. In case of failure, report and return.
   ci.getParsing().Parse(llvm::outs());
 
-  if (!ci.getParsing().messages().empty() &&
-      (ci.getInvocation().getWarnAsErr() ||
-       ci.getParsing().messages().AnyFatalError())) {
+  if ((ci.getParsing().parseTree().has_value() &&
+       !ci.getParsing().consumedWholeFile()) ||
+      (!ci.getParsing().messages().empty() &&
+       (ci.getInvocation().getWarnAsErr() ||
+        ci.getParsing().messages().AnyFatalError()))) {
     unsigned diagID = ci.getDiagnostics().getCustomDiagID(
         clang::DiagnosticsEngine::Error, "Could not parse %0");
     ci.getDiagnostics().Report(diagID) << getCurrentFileOrBufferName();
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index c7bf42481554..37f51d74d23f 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -833,7 +833,11 @@ public:
           if_builder.end();
         },
         [&](const auto &) -> void {
-          if (skipDefaultInit)
+          // Always initialize allocatable component descriptor, even when the
+          // value is later copied from the host (e.g. firstprivate) because the
+          // assignment from the host to the copy will fail if the component
+          // descriptors are not initialized.
+          if (skipDefaultInit && !hlfir::mayHaveAllocatableComponent(hSymType))
             return;
           // Initialize local/private derived types with default
           // initialization (Fortran 2023 section 11.1.7.5 and OpenMP 5.2
diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt
index ba6622d8504a..f57f0e7a77a0 100644
--- a/flang/lib/Lower/CMakeLists.txt
+++ b/flang/lib/Lower/CMakeLists.txt
@@ -29,6 +29,7 @@ add_flang_library(FortranLower
   OpenMP/DataSharingProcessor.cpp
   OpenMP/Decomposer.cpp
   OpenMP/OpenMP.cpp
+  OpenMP/PrivateReductionUtils.cpp
   OpenMP/ReductionProcessor.cpp
   OpenMP/Utils.cpp
   PFTBuilder.cpp
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index cd312537551e..9dfdbd8337ae 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -126,7 +126,8 @@ void DataSharingProcessor::cloneSymbol(const semantics::Symbol *sym) {
     assert(sb);
     mlir::Value addr = sb.getAddr();
     assert(addr);
-    return hlfir::mayHaveAllocatableComponent(addr.getType());
+    return !fir::isPointerType(addr.getType()) &&
+           hlfir::mayHaveAllocatableComponent(addr.getType());
   };
 
   if (needInitClone()) {
diff --git a/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp b/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp
new file mode 100644
index 000000000000..83f0d4e93ca5
--- /dev/null
+++ b/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp
@@ -0,0 +1,236 @@
+//===-- PrivateReductionUtils.cpp -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#include "PrivateReductionUtils.h"
+
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/HLFIRTools.h"
+#include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/Support/FatalError.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/Location.h"
+
+static void createCleanupRegion(fir::FirOpBuilder &builder, mlir::Location loc,
+                                mlir::Type argType,
+                                mlir::Region &cleanupRegion) {
+  assert(cleanupRegion.empty());
+  mlir::Block *block = builder.createBlock(&cleanupRegion, cleanupRegion.end(),
+                                           {argType}, {loc});
+  builder.setInsertionPointToEnd(block);
+
+  auto typeError = [loc]() {
+    fir::emitFatalError(loc,
+                        "Attempt to create an omp cleanup region "
+                        "for a type that wasn't allocated",
+                        /*genCrashDiag=*/true);
+  };
+
+  mlir::Type valTy = fir::unwrapRefType(argType);
+  if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(valTy)) {
+    if (!mlir::isa<fir::HeapType, fir::PointerType>(boxTy.getEleTy())) {
+      mlir::Type innerTy = fir::extractSequenceType(boxTy);
+      if (!mlir::isa<fir::SequenceType>(innerTy))
+        typeError();
+    }
+
+    mlir::Value arg = builder.loadIfRef(loc, block->getArgument(0));
+    assert(mlir::isa<fir::BaseBoxType>(arg.getType()));
+
+    // Deallocate box
+    // The FIR type system doesn't nesecarrily know that this is a mutable box
+    // if we allocated the thread local array on the heap to avoid looped stack
+    // allocations.
+    mlir::Value addr =
+        hlfir::genVariableRawAddress(loc, builder, hlfir::Entity{arg});
+    mlir::Value isAllocated = builder.genIsNotNullAddr(loc, addr);
+    fir::IfOp ifOp =
+        builder.create<fir::IfOp>(loc, isAllocated, /*withElseRegion=*/false);
+    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+
+    mlir::Value cast = builder.createConvert(
+        loc, fir::HeapType::get(fir::dyn_cast_ptrEleTy(addr.getType())), addr);
+    builder.create<fir::FreeMemOp>(loc, cast);
+
+    builder.setInsertionPointAfter(ifOp);
+    builder.create<mlir::omp::YieldOp>(loc);
+    return;
+  }
+
+  typeError();
+}
+
+fir::ShapeShiftOp Fortran::lower::omp::getShapeShift(fir::FirOpBuilder &builder,
+                                                     mlir::Location loc,
+                                                     mlir::Value box) {
+  fir::SequenceType sequenceType = mlir::cast<fir::SequenceType>(
+      hlfir::getFortranElementOrSequenceType(box.getType()));
+  const unsigned rank = sequenceType.getDimension();
+  llvm::SmallVector<mlir::Value> lbAndExtents;
+  lbAndExtents.reserve(rank * 2);
+
+  mlir::Type idxTy = builder.getIndexType();
+  for (unsigned i = 0; i < rank; ++i) {
+    // TODO: ideally we want to hoist box reads out of the critical section.
+    // We could do this by having box dimensions in block arguments like
+    // OpenACC does
+    mlir::Value dim = builder.createIntegerConstant(loc, idxTy, i);
+    auto dimInfo =
+        builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy, box, dim);
+    lbAndExtents.push_back(dimInfo.getLowerBound());
+    lbAndExtents.push_back(dimInfo.getExtent());
+  }
+
+  auto shapeShiftTy = fir::ShapeShiftType::get(builder.getContext(), rank);
+  auto shapeShift =
+      builder.create<fir::ShapeShiftOp>(loc, shapeShiftTy, lbAndExtents);
+  return shapeShift;
+}
+
+void Fortran::lower::omp::populateByRefInitAndCleanupRegions(
+    fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type argType,
+    mlir::Value scalarInitValue, mlir::Block *initBlock,
+    mlir::Value allocatedPrivVarArg, mlir::Value moldArg,
+    mlir::Region &cleanupRegion) {
+  mlir::Type ty = fir::unwrapRefType(argType);
+  builder.setInsertionPointToEnd(initBlock);
+  auto yield = [&](mlir::Value ret) {
+    builder.create<mlir::omp::YieldOp>(loc, ret);
+  };
+
+  if (fir::isa_trivial(ty)) {
+    builder.setInsertionPointToEnd(initBlock);
+
+    if (scalarInitValue)
+      builder.createStoreWithConvert(loc, scalarInitValue, allocatedPrivVarArg);
+    yield(allocatedPrivVarArg);
+    return;
+  }
+
+  // check if an allocatable box is unallocated. If so, initialize the boxAlloca
+  // to be unallocated e.g.
+  // %box_alloca = fir.alloca !fir.box<!fir.heap<...>>
+  // %addr = fir.box_addr %box
+  // if (%addr == 0) {
+  //   %nullbox = fir.embox %addr
+  //   fir.store %nullbox to %box_alloca
+  // } else {
+  //   // ...
+  //   fir.store %something to %box_alloca
+  // }
+  // omp.yield %box_alloca
+  moldArg = builder.loadIfRef(loc, moldArg);
+  auto handleNullAllocatable = [&](mlir::Value boxAlloca) -> fir::IfOp {
+    mlir::Value addr = builder.create<fir::BoxAddrOp>(loc, moldArg);
+    mlir::Value isNotAllocated = builder.genIsNullAddr(loc, addr);
+    fir::IfOp ifOp = builder.create<fir::IfOp>(loc, isNotAllocated,
+                                               /*withElseRegion=*/true);
+    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+    // just embox the null address and return
+    mlir::Value nullBox = builder.create<fir::EmboxOp>(loc, ty, addr);
+    builder.create<fir::StoreOp>(loc, nullBox, boxAlloca);
+    return ifOp;
+  };
+
+  // all arrays are boxed
+  if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(ty)) {
+    bool isAllocatableOrPointer =
+        mlir::isa<fir::HeapType, fir::PointerType>(boxTy.getEleTy());
+
+    builder.setInsertionPointToEnd(initBlock);
+    mlir::Value boxAlloca = allocatedPrivVarArg;
+    mlir::Type innerTy = fir::unwrapRefType(boxTy.getEleTy());
+    if (fir::isa_trivial(innerTy)) {
+      // boxed non-sequence value e.g. !fir.box<!fir.heap<i32>>
+      if (!isAllocatableOrPointer)
+        TODO(loc,
+             "Reduction/Privatization of non-allocatable trivial typed box");
+
+      fir::IfOp ifUnallocated = handleNullAllocatable(boxAlloca);
+
+      builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front());
+      mlir::Value valAlloc = builder.create<fir::AllocMemOp>(loc, innerTy);
+      if (scalarInitValue)
+        builder.createStoreWithConvert(loc, scalarInitValue, valAlloc);
+      mlir::Value box = builder.create<fir::EmboxOp>(loc, ty, valAlloc);
+      builder.create<fir::StoreOp>(loc, box, boxAlloca);
+
+      createCleanupRegion(builder, loc, argType, cleanupRegion);
+      builder.setInsertionPointAfter(ifUnallocated);
+      yield(boxAlloca);
+      return;
+    }
+    innerTy = fir::extractSequenceType(boxTy);
+    if (!mlir::isa<fir::SequenceType>(innerTy))
+      TODO(loc, "Unsupported boxed type for reduction/privatization");
+
+    fir::IfOp ifUnallocated{nullptr};
+    if (isAllocatableOrPointer) {
+      ifUnallocated = handleNullAllocatable(boxAlloca);
+      builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front());
+    }
+
+    // Create the private copy from the initial fir.box:
+    mlir::Value loadedBox = builder.loadIfRef(loc, moldArg);
+    hlfir::Entity source = hlfir::Entity{loadedBox};
+
+    // Allocating on the heap in case the whole reduction is nested inside of a
+    // loop
+    // TODO: compare performance here to using allocas - this could be made to
+    // work by inserting stacksave/stackrestore around the reduction in
+    // openmpirbuilder
+    auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
+    // if needsDealloc isn't statically false, add cleanup region. Always
+    // do this for allocatable boxes because they might have been re-allocated
+    // in the body of the loop/parallel region
+
+    std::optional<int64_t> cstNeedsDealloc =
+        fir::getIntIfConstant(needsDealloc);
+    assert(cstNeedsDealloc.has_value() &&
+           "createTempFromMold decides this statically");
+    if (cstNeedsDealloc.has_value() && *cstNeedsDealloc != false) {
+      mlir::OpBuilder::InsertionGuard guard(builder);
+      createCleanupRegion(builder, loc, argType, cleanupRegion);
+    } else {
+      assert(!isAllocatableOrPointer &&
+             "Pointer-like arrays must be heap allocated");
+    }
+
+    // Put the temporary inside of a box:
+    // hlfir::genVariableBox doesn't handle non-default lower bounds
+    mlir::Value box;
+    fir::ShapeShiftOp shapeShift = getShapeShift(builder, loc, loadedBox);
+    mlir::Type boxType = loadedBox.getType();
+    if (mlir::isa<fir::BaseBoxType>(temp.getType()))
+      // the box created by the declare form createTempFromMold is missing lower
+      // bounds info
+      box = builder.create<fir::ReboxOp>(loc, boxType, temp, shapeShift,
+                                         /*shift=*/mlir::Value{});
+    else
+      box = builder.create<fir::EmboxOp>(
+          loc, boxType, temp, shapeShift,
+          /*slice=*/mlir::Value{},
+          /*typeParams=*/llvm::ArrayRef<mlir::Value>{});
+
+    if (scalarInitValue)
+      builder.create<hlfir::AssignOp>(loc, scalarInitValue, box);
+    builder.create<fir::StoreOp>(loc, box, boxAlloca);
+    if (ifUnallocated)
+      builder.setInsertionPointAfter(ifUnallocated);
+    yield(boxAlloca);
+    return;
+  }
+
+  TODO(loc,
+       "creating reduction/privatization init region for unsupported type");
+  return;
+}
diff --git a/flang/lib/Lower/OpenMP/PrivateReductionUtils.h b/flang/lib/Lower/OpenMP/PrivateReductionUtils.h
new file mode 100644
index 000000000000..b4abc40cd4b6
--- /dev/null
+++ b/flang/lib/Lower/OpenMP/PrivateReductionUtils.h
@@ -0,0 +1,51 @@
+//===-- Lower/OpenMP/PrivateReductionUtils.h --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_LOWER_OPENMP_PRIVATEREDUCTIONUTILS_H
+#define FORTRAN_LOWER_OPENMP_PRIVATEREDUCTIONUTILS_H
+
+#include "mlir/IR/Location.h"
+#include "mlir/IR/Value.h"
+
+namespace mlir {
+class Region;
+} // namespace mlir
+
+namespace fir {
+class FirOpBuilder;
+class ShapeShiftOp;
+} // namespace fir
+
+namespace Fortran {
+namespace lower {
+namespace omp {
+
+/// Generate init and cleanup regions suitable for reduction or privatizer
+/// declarations. `scalarInitValue` may be nullptr if there is no default
+/// initialization (for privatization).
+void populateByRefInitAndCleanupRegions(fir::FirOpBuilder &builder,
+                                        mlir::Location loc, mlir::Type argType,
+                                        mlir::Value scalarInitValue,
+                                        mlir::Block *initBlock,
+                                        mlir::Value allocatedPrivVarArg,
+                                        mlir::Value moldArg,
+                                        mlir::Region &cleanupRegion);
+
+/// Generate a fir::ShapeShift op describing the provided boxed array.
+fir::ShapeShiftOp getShapeShift(fir::FirOpBuilder &builder, mlir::Location loc,
+                                mlir::Value box);
+
+} // namespace omp
+} // namespace lower
+} // namespace Fortran
+
+#endif // FORTRAN_LOWER_OPENMP_PRIVATEREDUCTIONUTILS_H
diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
index 736de2ee511b..2cd21107a916 100644
--- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
@@ -12,6 +12,7 @@
 
 #include "ReductionProcessor.h"
 
+#include "PrivateReductionUtils.h"
 #include "flang/Lower/AbstractConverter.h"
 #include "flang/Lower/ConvertType.h"
 #include "flang/Lower/SymbolMap.h"
@@ -294,33 +295,6 @@ mlir::Value ReductionProcessor::createScalarCombiner(
   return reductionOp;
 }
 
-/// Generate a fir::ShapeShift op describing the provided boxed array.
-static fir::ShapeShiftOp getShapeShift(fir::FirOpBuilder &builder,
-                                       mlir::Location loc, mlir::Value box) {
-  fir::SequenceType sequenceType = mlir::cast<fir::SequenceType>(
-      hlfir::getFortranElementOrSequenceType(box.getType()));
-  const unsigned rank = sequenceType.getDimension();
-  llvm::SmallVector<mlir::Value> lbAndExtents;
-  lbAndExtents.reserve(rank * 2);
-
-  mlir::Type idxTy = builder.getIndexType();
-  for (unsigned i = 0; i < rank; ++i) {
-    // TODO: ideally we want to hoist box reads out of the critical section.
-    // We could do this by having box dimensions in block arguments like
-    // OpenACC does
-    mlir::Value dim = builder.createIntegerConstant(loc, idxTy, i);
-    auto dimInfo =
-        builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy, box, dim);
-    lbAndExtents.push_back(dimInfo.getLowerBound());
-    lbAndExtents.push_back(dimInfo.getExtent());
-  }
-
-  auto shapeShiftTy = fir::ShapeShiftType::get(builder.getContext(), rank);
-  auto shapeShift =
-      builder.create<fir::ShapeShiftOp>(loc, shapeShiftTy, lbAndExtents);
-  return shapeShift;
-}
-
 /// Create reduction combiner region for reduction variables which are boxed
 /// arrays
 static void genBoxCombiner(fir::FirOpBuilder &builder, mlir::Location loc,
@@ -422,59 +396,6 @@ static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc,
   TODO(loc, "OpenMP genCombiner for unsupported reduction variable type");
 }
 
-static void
-createReductionCleanupRegion(fir::FirOpBuilder &builder, mlir::Location loc,
-                             mlir::omp::DeclareReductionOp &reductionDecl) {
-  mlir::Type redTy = reductionDecl.getType();
-
-  mlir::Region &cleanupRegion = reductionDecl.getCleanupRegion();
-  assert(cleanupRegion.empty());
-  mlir::Block *block =
-      builder.createBlock(&cleanupRegion, cleanupRegion.end(), {redTy}, {loc});
-  builder.setInsertionPointToEnd(block);
-
-  auto typeError = [loc]() {
-    fir::emitFatalError(loc,
-                        "Attempt to create an omp reduction cleanup region "
-                        "for a type that wasn't allocated",
-                        /*genCrashDiag=*/true);
-  };
-
-  mlir::Type valTy = fir::unwrapRefType(redTy);
-  if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(valTy)) {
-    if (!mlir::isa<fir::HeapType, fir::PointerType>(boxTy.getEleTy())) {
-      mlir::Type innerTy = fir::extractSequenceType(boxTy);
-      if (!mlir::isa<fir::SequenceType>(innerTy))
-        typeError();
-    }
-
-    mlir::Value arg = block->getArgument(0);
-    arg = builder.loadIfRef(loc, arg);
-    assert(mlir::isa<fir::BaseBoxType>(arg.getType()));
-
-    // Deallocate box
-    // The FIR type system doesn't nesecarrily know that this is a mutable box
-    // if we allocated the thread local array on the heap to avoid looped stack
-    // allocations.
-    mlir::Value addr =
-        hlfir::genVariableRawAddress(loc, builder, hlfir::Entity{arg});
-    mlir::Value isAllocated = builder.genIsNotNullAddr(loc, addr);
-    fir::IfOp ifOp =
-        builder.create<fir::IfOp>(loc, isAllocated, /*withElseRegion=*/false);
-    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
-
-    mlir::Value cast = builder.createConvert(
-        loc, fir::HeapType::get(fir::dyn_cast_ptrEleTy(addr.getType())), addr);
-    builder.create<fir::FreeMemOp>(loc, cast);
-
-    builder.setInsertionPointAfter(ifOp);
-    builder.create<mlir::omp::YieldOp>(loc);
-    return;
-  }
-
-  typeError();
-}
-
 // like fir::unwrapSeqOrBoxedSeqType except it also works for non-sequence boxes
 static mlir::Type unwrapSeqOrBoxedType(mlir::Type ty) {
   if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
@@ -517,154 +438,31 @@ static void createReductionAllocAndInitRegions(
   mlir::Value initValue = ReductionProcessor::getReductionInitValue(
       loc, unwrapSeqOrBoxedType(ty), redId, builder);
 
+  if (isByRef) {
+    populateByRefInitAndCleanupRegions(builder, loc, type, initValue, initBlock,
+                                       reductionDecl.getInitializerAllocArg(),
+                                       reductionDecl.getInitializerMoldArg(),
+                                       reductionDecl.getCleanupRegion());
+  }
+
   if (fir::isa_trivial(ty)) {
     if (isByRef) {
       // alloc region
-      {
-        builder.setInsertionPointToEnd(allocBlock);
-        mlir::Value alloca = builder.create<fir::AllocaOp>(loc, ty);
-        yield(alloca);
-      }
-
-      // init region
-      {
-        builder.setInsertionPointToEnd(initBlock);
-        // block arg is mapped to the alloca yielded from the alloc region
-        mlir::Value alloc = reductionDecl.getInitializerAllocArg();
-        builder.createStoreWithConvert(loc, initValue, alloc);
-        yield(alloc);
-      }
+      builder.setInsertionPointToEnd(allocBlock);
+      mlir::Value alloca = builder.create<fir::AllocaOp>(loc, ty);
+      yield(alloca);
       return;
     }
     // by val
     yield(initValue);
     return;
   }
+  assert(isByRef && "passing non-trivial types by val is unsupported");
 
-  // check if an allocatable box is unallocated. If so, initialize the boxAlloca
-  // to be unallocated e.g.
-  // %box_alloca = fir.alloca !fir.box<!fir.heap<...>>
-  // %addr = fir.box_addr %box
-  // if (%addr == 0) {
-  //   %nullbox = fir.embox %addr
-  //   fir.store %nullbox to %box_alloca
-  // } else {
-  //   // ...
-  //   fir.store %something to %box_alloca
-  // }
-  // omp.yield %box_alloca
-  mlir::Value moldArg =
-      builder.loadIfRef(loc, reductionDecl.getInitializerMoldArg());
-  auto handleNullAllocatable = [&](mlir::Value boxAlloca) -> fir::IfOp {
-    mlir::Value addr = builder.create<fir::BoxAddrOp>(loc, moldArg);
-    mlir::Value isNotAllocated = builder.genIsNullAddr(loc, addr);
-    fir::IfOp ifOp = builder.create<fir::IfOp>(loc, isNotAllocated,
-                                               /*withElseRegion=*/true);
-    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
-    // just embox the null address and return
-    mlir::Value nullBox = builder.create<fir::EmboxOp>(loc, ty, addr);
-    builder.create<fir::StoreOp>(loc, nullBox, boxAlloca);
-    return ifOp;
-  };
-
-  // all arrays are boxed
-  if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(ty)) {
-    assert(isByRef && "passing boxes by value is unsupported");
-    bool isAllocatableOrPointer =
-        mlir::isa<fir::HeapType, fir::PointerType>(boxTy.getEleTy());
-
-    // alloc region
-    {
-      builder.setInsertionPointToEnd(allocBlock);
-      mlir::Value boxAlloca = builder.create<fir::AllocaOp>(loc, ty);
-      yield(boxAlloca);
-    }
-
-    // init region
-    builder.setInsertionPointToEnd(initBlock);
-    mlir::Value boxAlloca = reductionDecl.getInitializerAllocArg();
-    mlir::Type innerTy = fir::unwrapRefType(boxTy.getEleTy());
-    if (fir::isa_trivial(innerTy)) {
-      // boxed non-sequence value e.g. !fir.box<!fir.heap<i32>>
-      if (!isAllocatableOrPointer)
-        TODO(loc, "Reduction of non-allocatable trivial typed box");
-
-      fir::IfOp ifUnallocated = handleNullAllocatable(boxAlloca);
-
-      builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front());
-      mlir::Value valAlloc = builder.create<fir::AllocMemOp>(loc, innerTy);
-      builder.createStoreWithConvert(loc, initValue, valAlloc);
-      mlir::Value box = builder.create<fir::EmboxOp>(loc, ty, valAlloc);
-      builder.create<fir::StoreOp>(loc, box, boxAlloca);
-
-      auto insPt = builder.saveInsertionPoint();
-      createReductionCleanupRegion(builder, loc, reductionDecl);
-      builder.restoreInsertionPoint(insPt);
-      builder.setInsertionPointAfter(ifUnallocated);
-      yield(boxAlloca);
-      return;
-    }
-    innerTy = fir::extractSequenceType(boxTy);
-    if (!mlir::isa<fir::SequenceType>(innerTy))
-      TODO(loc, "Unsupported boxed type for reduction");
-
-    fir::IfOp ifUnallocated{nullptr};
-    if (isAllocatableOrPointer) {
-      ifUnallocated = handleNullAllocatable(boxAlloca);
-      builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front());
-    }
-
-    // Create the private copy from the initial fir.box:
-    mlir::Value loadedBox = builder.loadIfRef(loc, moldArg);
-    hlfir::Entity source = hlfir::Entity{loadedBox};
-
-    // Allocating on the heap in case the whole reduction is nested inside of a
-    // loop
-    // TODO: compare performance here to using allocas - this could be made to
-    // work by inserting stacksave/stackrestore around the reduction in
-    // openmpirbuilder
-    auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
-    // if needsDealloc isn't statically false, add cleanup region. Always
-    // do this for allocatable boxes because they might have been re-allocated
-    // in the body of the loop/parallel region
-
-    std::optional<int64_t> cstNeedsDealloc =
-        fir::getIntIfConstant(needsDealloc);
-    assert(cstNeedsDealloc.has_value() &&
-           "createTempFromMold decides this statically");
-    if (cstNeedsDealloc.has_value() && *cstNeedsDealloc != false) {
-      mlir::OpBuilder::InsertionGuard guard(builder);
-      createReductionCleanupRegion(builder, loc, reductionDecl);
-    } else {
-      assert(!isAllocatableOrPointer &&
-             "Pointer-like arrays must be heap allocated");
-    }
-
-    // Put the temporary inside of a box:
-    // hlfir::genVariableBox doesn't handle non-default lower bounds
-    mlir::Value box;
-    fir::ShapeShiftOp shapeShift = getShapeShift(builder, loc, loadedBox);
-    mlir::Type boxType = loadedBox.getType();
-    if (mlir::isa<fir::BaseBoxType>(temp.getType()))
-      // the box created by the declare form createTempFromMold is missing lower
-      // bounds info
-      box = builder.create<fir::ReboxOp>(loc, boxType, temp, shapeShift,
-                                         /*shift=*/mlir::Value{});
-    else
-      box = builder.create<fir::EmboxOp>(
-          loc, boxType, temp, shapeShift,
-          /*slice=*/mlir::Value{},
-          /*typeParams=*/llvm::ArrayRef<mlir::Value>{});
-
-    builder.create<hlfir::AssignOp>(loc, initValue, box);
-    builder.create<fir::StoreOp>(loc, box, boxAlloca);
-    if (ifUnallocated)
-      builder.setInsertionPointAfter(ifUnallocated);
-    yield(boxAlloca);
-    return;
-  }
-
-  TODO(loc, "createReductionInitRegion for unsupported type");
+  // alloc region
+  builder.setInsertionPointToEnd(allocBlock);
+  mlir::Value boxAlloca = builder.create<fir::AllocaOp>(loc, ty);
+  yield(boxAlloca);
 }
 
 mlir::omp::DeclareReductionOp ReductionProcessor::createDeclareReduction(
diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt
index a824d70fdb5c..0960e858c411 100644
--- a/flang/lib/Optimizer/Builder/CMakeLists.txt
+++ b/flang/lib/Optimizer/Builder/CMakeLists.txt
@@ -49,6 +49,7 @@ add_flang_library(FIRBuilder
   FIRDialect
   FIRDialectSupport
   FIRSupport
+  FortranEvaluate
   HLFIRDialect
   ${dialect_libs}
   ${extension_libs}
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index 3a39c455015f..d01becfe8009 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -1626,6 +1626,25 @@ mlir::Value fir::factory::genCPtrOrCFunptrAddr(fir::FirOpBuilder &builder,
                                            cPtr, addrFieldIndex);
 }
 
+mlir::Value fir::factory::genCDevPtrAddr(fir::FirOpBuilder &builder,
+                                         mlir::Location loc,
+                                         mlir::Value cDevPtr, mlir::Type ty) {
+  auto recTy = mlir::cast<fir::RecordType>(ty);
+  assert(recTy.getTypeList().size() == 1);
+  auto cptrFieldName = recTy.getTypeList()[0].first;
+  mlir::Type cptrFieldTy = recTy.getTypeList()[0].second;
+  auto fieldIndexType = fir::FieldType::get(ty.getContext());
+  mlir::Value cptrFieldIndex = builder.create<fir::FieldIndexOp>(
+      loc, fieldIndexType, cptrFieldName, recTy,
+      /*typeParams=*/mlir::ValueRange{});
+  auto cptrCoord = builder.create<fir::CoordinateOp>(
+      loc, builder.getRefType(cptrFieldTy), cDevPtr, cptrFieldIndex);
+  auto [addrFieldIndex, addrFieldTy] =
+      genCPtrOrCFunptrFieldIndex(builder, loc, cptrFieldTy);
+  return builder.create<fir::CoordinateOp>(loc, builder.getRefType(addrFieldTy),
+                                           cptrCoord, addrFieldIndex);
+}
+
 mlir::Value fir::factory::genCPtrOrCFunptrValue(fir::FirOpBuilder &builder,
                                                 mlir::Location loc,
                                                 mlir::Value cPtr) {
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 9a3777994a9d..cb0af392073f 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -167,6 +167,7 @@ static constexpr IntrinsicHandler handlers[]{
      &I::genCAssociatedCPtr,
      {{{"c_ptr_1", asAddr}, {"c_ptr_2", asAddr, handleDynamicOptional}}},
      /*isElemental=*/false},
+    {"c_devloc", &I::genCDevLoc, {{{"x", asBox}}}, /*isElemental=*/false},
     {"c_f_pointer",
      &I::genCFPointer,
      {{{"cptr", asValue},
@@ -2867,11 +2868,14 @@ static mlir::Value getAddrFromBox(fir::FirOpBuilder &builder,
 static fir::ExtendedValue
 genCLocOrCFunLoc(fir::FirOpBuilder &builder, mlir::Location loc,
                  mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args,
-                 bool isFunc = false) {
+                 bool isFunc = false, bool isDevLoc = false) {
   assert(args.size() == 1);
   mlir::Value res = builder.create<fir::AllocaOp>(loc, resultType);
-  mlir::Value resAddr =
-      fir::factory::genCPtrOrCFunptrAddr(builder, loc, res, resultType);
+  mlir::Value resAddr;
+  if (isDevLoc)
+    resAddr = fir::factory::genCDevPtrAddr(builder, loc, res, resultType);
+  else
+    resAddr = fir::factory::genCPtrOrCFunptrAddr(builder, loc, res, resultType);
   assert(fir::isa_box_type(fir::getBase(args[0]).getType()) &&
          "argument must have been lowered to box type");
   mlir::Value argAddr = getAddrFromBox(builder, loc, args[0], isFunc);
@@ -2928,6 +2932,14 @@ IntrinsicLibrary::genCAssociatedCPtr(mlir::Type resultType,
   return genCAssociated(builder, loc, resultType, args);
 }
 
+// C_DEVLOC
+fir::ExtendedValue
+IntrinsicLibrary::genCDevLoc(mlir::Type resultType,
+                             llvm::ArrayRef<fir::ExtendedValue> args) {
+  return genCLocOrCFunLoc(builder, loc, resultType, args, /*isFunc=*/false,
+                          /*isDevLoc=*/true);
+}
+
 // C_F_POINTER
 void IntrinsicLibrary::genCFPointer(llvm::ArrayRef<fir::ExtendedValue> args) {
   assert(args.size() == 3);
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index cdcf9bda49a6..fa83aa380e48 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -1313,7 +1313,8 @@ void fir::ConvertOp::getCanonicalizationPatterns(
   results.insert<ConvertConvertOptPattern, ConvertAscendingIndexOptPattern,
                  ConvertDescendingIndexOptPattern, RedundantConvertOptPattern,
                  CombineConvertOptPattern, CombineConvertTruncOptPattern,
-                 ForwardConstantConvertPattern>(context);
+                 ForwardConstantConvertPattern, ChainedPointerConvertsPattern>(
+      context);
 }
 
 mlir::OpFoldResult fir::ConvertOp::fold(FoldAdaptor adaptor) {
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 20e4599587c4..e1d7376ec380 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -240,6 +240,16 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP,
   pm.addPass(hlfir::createLowerHLFIROrderedAssignments());
   pm.addPass(hlfir::createLowerHLFIRIntrinsics());
   pm.addPass(hlfir::createBufferizeHLFIR());
+  // Run hlfir.assign inlining again after BufferizeHLFIR,
+  // because the latter may introduce new hlfir.assign operations,
+  // e.g. for copying an array into a temporary due to
+  // hlfir.associate.
+  // TODO: we can remove the previous InlineHLFIRAssign, when
+  // FIR AliasAnalysis is good enough to say that a temporary
+  // array does not alias with any user object.
+  if (optLevel.isOptimizingForSpeed())
+    addNestedPassToAllTopLevelOperations<PassConstructor>(
+        pm, hlfir::createInlineHLFIRAssign);
   pm.addPass(hlfir::createConvertHLFIRtoFIR());
   if (enableOpenMP)
     pm.addPass(flangomp::createLowerWorkshare());
diff --git a/flang/lib/Optimizer/Transforms/StackArrays.cpp b/flang/lib/Optimizer/Transforms/StackArrays.cpp
index bdcb8199b790..2a9d3397e87b 100644
--- a/flang/lib/Optimizer/Transforms/StackArrays.cpp
+++ b/flang/lib/Optimizer/Transforms/StackArrays.cpp
@@ -330,6 +330,18 @@ std::optional<AllocationState> LatticePoint::get(mlir::Value val) const {
   return it->second;
 }
 
+static mlir::Value lookThroughDeclaresAndConverts(mlir::Value value) {
+  while (mlir::Operation *op = value.getDefiningOp()) {
+    if (auto declareOp = llvm::dyn_cast<fir::DeclareOp>(op))
+      value = declareOp.getMemref();
+    else if (auto convertOp = llvm::dyn_cast<fir::ConvertOp>(op))
+      value = convertOp->getOperand(0);
+    else
+      return value;
+  }
+  return value;
+}
+
 mlir::LogicalResult AllocationAnalysis::visitOperation(
     mlir::Operation *op, const LatticePoint &before, LatticePoint *after) {
   LLVM_DEBUG(llvm::dbgs() << "StackArrays: Visiting operation: " << *op
@@ -363,10 +375,10 @@ mlir::LogicalResult AllocationAnalysis::visitOperation(
     mlir::Value operand = op->getOperand(0);
 
     // Note: StackArrays is scheduled in the pass pipeline after lowering hlfir
-    // to fir. Therefore, we only need to handle `fir::DeclareOp`s.
-    if (auto declareOp =
-            llvm::dyn_cast_if_present<fir::DeclareOp>(operand.getDefiningOp()))
-      operand = declareOp.getMemref();
+    // to fir. Therefore, we only need to handle `fir::DeclareOp`s. Also look
+    // past converts in case the pointer was changed between different pointer
+    // types.
+    operand = lookThroughDeclaresAndConverts(operand);
 
     std::optional<AllocationState> operandState = before.get(operand);
     if (operandState && *operandState == AllocationState::Allocated) {
@@ -535,17 +547,12 @@ AllocMemConversion::matchAndRewrite(fir::AllocMemOp allocmem,
 
   // remove freemem operations
   llvm::SmallVector<mlir::Operation *> erases;
-  for (mlir::Operation *user : allocmem.getOperation()->getUsers()) {
-    if (auto declareOp = mlir::dyn_cast_if_present<fir::DeclareOp>(user)) {
-      for (mlir::Operation *user : declareOp->getUsers()) {
-        if (mlir::isa<fir::FreeMemOp>(user))
-          erases.push_back(user);
-      }
-    }
-
-    if (mlir::isa<fir::FreeMemOp>(user))
-      erases.push_back(user);
-  }
+  mlir::Operation *parent = allocmem->getParentOp();
+  // TODO: this shouldn't need to be re-calculated for every allocmem
+  parent->walk([&](fir::FreeMemOp freeOp) {
+    if (lookThroughDeclaresAndConverts(freeOp->getOperand(0)) == allocmem)
+      erases.push_back(freeOp);
+  });
 
   // now we are done iterating the users, it is safe to mutate them
   for (mlir::Operation *erase : erases)
diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp
index 3cd32d7e6c92..703a02792a1c 100644
--- a/flang/lib/Parser/prescan.cpp
+++ b/flang/lib/Parser/prescan.cpp
@@ -709,9 +709,22 @@ bool Prescanner::NextToken(TokenSequence &tokens) {
       QuotedCharacterLiteral(tokens, start);
     } else if (IsLetter(*at_) && !preventHollerith_ &&
         parenthesisNesting_ > 0) {
-      // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
-      // we don't misrecognize I9HOLLERITH as an identifier in the next case.
-      EmitCharAndAdvance(tokens, *at_);
+      const char *p{at_};
+      int digits{0};
+      for (;; ++digits) {
+        ++p;
+        if (InFixedFormSource()) {
+          p = SkipWhiteSpace(p);
+        }
+        if (!IsDecimalDigit(*p)) {
+          break;
+        }
+      }
+      if (digits > 0 && (*p == 'h' || *p == 'H')) {
+        // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
+        // we don't misrecognize I9HOLLERITH as an identifier in the next case.
+        EmitCharAndAdvance(tokens, *at_);
+      }
     }
     preventHollerith_ = false;
   } else if (*at_ == '.') {
@@ -1289,14 +1302,18 @@ const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
     return nullptr;
   }
   p = SkipWhiteSpace(p);
-  if (InCompilerDirective()) {
-    if (*p++ != '!') {
-      return nullptr;
-    }
-    for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
-      if (*s != ToLowerCaseLetter(*p)) {
-        return nullptr;
+  if (*p == '!') {
+    ++p;
+    if (InCompilerDirective()) {
+      for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
+        if (*s != ToLowerCaseLetter(*p)) {
+          return nullptr;
+        }
       }
+    } else if (features_.IsEnabled(LanguageFeature::OpenMP) && *p == '$') {
+      ++p;
+    } else {
+      return nullptr;
     }
     p = SkipWhiteSpace(p);
     if (*p == '&') {
diff --git a/flang/lib/Semantics/assignment.cpp b/flang/lib/Semantics/assignment.cpp
index e69a73c7837c..0b57197fb8db 100644
--- a/flang/lib/Semantics/assignment.cpp
+++ b/flang/lib/Semantics/assignment.cpp
@@ -66,8 +66,13 @@ void AssignmentContext::Analyze(const parser::AssignmentStmt &stmt) {
     const SomeExpr &rhs{assignment->rhs};
     auto lhsLoc{std::get<parser::Variable>(stmt.t).GetSource()};
     const Scope &scope{context_.FindScope(lhsLoc)};
-    if (auto whyNot{WhyNotDefinable(lhsLoc, scope,
-            DefinabilityFlags{DefinabilityFlag::VectorSubscriptIsOk}, lhs)}) {
+    DefinabilityFlags flags{DefinabilityFlag::VectorSubscriptIsOk};
+    bool isDefinedAssignment{
+        std::holds_alternative<evaluate::ProcedureRef>(assignment->u)};
+    if (isDefinedAssignment) {
+      flags.set(DefinabilityFlag::AllowEventLockOrNotifyType);
+    }
+    if (auto whyNot{WhyNotDefinable(lhsLoc, scope, flags, lhs)}) {
       if (whyNot->IsFatal()) {
         if (auto *msg{Say(lhsLoc,
                 "Left-hand side of assignment is not definable"_err_en_US)}) {
@@ -79,9 +84,7 @@ void AssignmentContext::Analyze(const parser::AssignmentStmt &stmt) {
       }
     }
     auto rhsLoc{std::get<parser::Expr>(stmt.t).source};
-    if (std::holds_alternative<evaluate::ProcedureRef>(assignment->u)) {
-      // it's a defined ASSIGNMENT(=)
-    } else {
+    if (!isDefinedAssignment) {
       CheckForPureContext(rhs, rhsLoc);
     }
     if (whereDepth_ > 0) {
diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp
index 597c280a6df8..ba68a0f898d4 100644
--- a/flang/lib/Semantics/check-call.cpp
+++ b/flang/lib/Semantics/check-call.cpp
@@ -690,7 +690,8 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
     }
   }
   if (actualLastObject && actualLastObject->IsCoarray() &&
-      IsAllocatable(*actualLastSymbol) && dummy.intent == common::Intent::Out &&
+      dummy.attrs.test(characteristics::DummyDataObject::Attr::Allocatable) &&
+      dummy.intent == common::Intent::Out &&
       !(intrinsic &&
           evaluate::AcceptsIntentOutAllocatableCoarray(
               intrinsic->name))) { // C846
@@ -703,12 +704,14 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
   // Problems with polymorphism are caught in the callee's definition.
   if (scope) {
     std::optional<parser::MessageFixedText> undefinableMessage;
-    if (dummy.intent == common::Intent::Out) {
-      undefinableMessage =
-          "Actual argument associated with INTENT(OUT) %s is not definable"_err_en_US;
-    } else if (dummy.intent == common::Intent::InOut) {
+    DefinabilityFlags flags{DefinabilityFlag::PolymorphicOkInPure};
+    if (dummy.intent == common::Intent::InOut) {
+      flags.set(DefinabilityFlag::AllowEventLockOrNotifyType);
       undefinableMessage =
           "Actual argument associated with INTENT(IN OUT) %s is not definable"_err_en_US;
+    } else if (dummy.intent == common::Intent::Out) {
+      undefinableMessage =
+          "Actual argument associated with INTENT(OUT) %s is not definable"_err_en_US;
     } else if (context.ShouldWarn(common::LanguageFeature::
                        UndefinableAsynchronousOrVolatileActual)) {
       if (dummy.attrs.test(
@@ -722,7 +725,6 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
       }
     }
     if (undefinableMessage) {
-      DefinabilityFlags flags{DefinabilityFlag::PolymorphicOkInPure};
       if (isElemental) { // 15.5.2.4(21)
         flags.set(DefinabilityFlag::VectorSubscriptIsOk);
       }
@@ -1622,8 +1624,8 @@ static void CheckImage_Index(evaluate::ActualArguments &arguments,
             evaluate::GetShape(arguments[1]->UnwrapExpr())}) {
       if (const auto *coarrayArgSymbol{UnwrapWholeSymbolOrComponentDataRef(
               arguments[0]->UnwrapExpr())}) {
-        const auto coarrayArgCorank = coarrayArgSymbol->Corank();
-        if (const auto subArrSize = evaluate::ToInt64(*subArrShape->front())) {
+        auto coarrayArgCorank{coarrayArgSymbol->Corank()};
+        if (auto subArrSize{evaluate::ToInt64(*subArrShape->front())}) {
           if (subArrSize != coarrayArgCorank) {
             messages.Say(arguments[1]->sourceLocation(),
                 "The size of 'SUB=' (%jd) for intrinsic 'image_index' must be equal to the corank of 'COARRAY=' (%d)"_err_en_US,
diff --git a/flang/lib/Semantics/definable.cpp b/flang/lib/Semantics/definable.cpp
index 88f9463e35c7..6d0155c24c31 100644
--- a/flang/lib/Semantics/definable.cpp
+++ b/flang/lib/Semantics/definable.cpp
@@ -204,7 +204,8 @@ static std::optional<parser::Message> WhyNotDefinableLast(parser::CharBlock at,
     }
     return std::nullopt; // pointer assignment - skip following checks
   }
-  if (IsOrContainsEventOrLockComponent(ultimate)) {
+  if (!flags.test(DefinabilityFlag::AllowEventLockOrNotifyType) &&
+      IsOrContainsEventOrLockComponent(ultimate)) {
     return BlameSymbol(at,
         "'%s' is an entity with either an EVENT_TYPE or LOCK_TYPE"_en_US,
         original);
diff --git a/flang/lib/Semantics/definable.h b/flang/lib/Semantics/definable.h
index 709bbba494d1..902702dbccbf 100644
--- a/flang/lib/Semantics/definable.h
+++ b/flang/lib/Semantics/definable.h
@@ -32,7 +32,8 @@ ENUM_CLASS(DefinabilityFlag,
     AcceptAllocatable, // treat allocatable as if it were a pointer
     SourcedAllocation, // ALLOCATE(a,SOURCE=)
     PolymorphicOkInPure, // don't check for polymorphic type in pure subprogram
-    DoNotNoteDefinition) // context does not imply definition
+    DoNotNoteDefinition, // context does not imply definition
+    AllowEventLockOrNotifyType)
 
 using DefinabilityFlags =
     common::EnumSet<DefinabilityFlag, DefinabilityFlag_enumSize>;
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index c2eb17c1ac8e..3ec6f385ceb8 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -1506,9 +1506,9 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::CoindexedNamedObject &x) {
     if (cosubsOk && !reversed.empty()) {
       int numCosubscripts{static_cast<int>(cosubscripts.size())};
       const Symbol &symbol{reversed.front()};
-      if (numCosubscripts != symbol.Corank()) {
+      if (numCosubscripts != GetCorank(symbol)) {
         Say("'%s' has corank %d, but coindexed reference has %d cosubscripts"_err_en_US,
-            symbol.name(), symbol.Corank(), numCosubscripts);
+            symbol.name(), GetCorank(symbol), numCosubscripts);
       }
     }
     for (const auto &imageSelSpec :
@@ -2536,6 +2536,15 @@ static bool CheckCompatibleArgument(bool isElemental,
             return false;
           },
           [&](const characteristics::DummyProcedure &dummy) {
+            if ((dummy.attrs.test(
+                     characteristics::DummyProcedure::Attr::Optional) ||
+                    dummy.attrs.test(
+                        characteristics::DummyProcedure::Attr::Pointer)) &&
+                IsBareNullPointer(expr)) {
+              // NULL() is compatible with any dummy pointer
+              // or optional dummy procedure.
+              return true;
+            }
             if (!expr || !IsProcedurePointerTarget(*expr)) {
               return false;
             }
diff --git a/flang/lib/Semantics/pointer-assignment.cpp b/flang/lib/Semantics/pointer-assignment.cpp
index 2450ce39215e..7f4548c7327e 100644
--- a/flang/lib/Semantics/pointer-assignment.cpp
+++ b/flang/lib/Semantics/pointer-assignment.cpp
@@ -76,6 +76,7 @@ private:
       const Procedure * = nullptr,
       const evaluate::SpecificIntrinsic *specific = nullptr);
   bool LhsOkForUnlimitedPoly() const;
+  std::optional<MessageFormattedText> CheckRanks(const TypeAndShape &rhs) const;
   template <typename... A> parser::Message *Say(A &&...);
   template <typename FeatureOrUsageWarning, typename... A>
   parser::Message *Warn(FeatureOrUsageWarning, A &&...);
@@ -278,10 +279,19 @@ bool PointerAssignmentChecker::Check(const evaluate::FunctionRef<T> &f) {
   } else if (lhsType_) {
     const auto *frTypeAndShape{funcResult->GetTypeAndShape()};
     CHECK(frTypeAndShape);
-    if (!lhsType_->IsCompatibleWith(foldingContext_.messages(), *frTypeAndShape,
-            "pointer", "function result",
-            /*omitShapeConformanceCheck=*/isBoundsRemapping_ || isAssumedRank_,
-            evaluate::CheckConformanceFlags::BothDeferredShape)) {
+    if (frTypeAndShape->type().IsUnlimitedPolymorphic() &&
+        LhsOkForUnlimitedPoly()) {
+      // Special case exception to type checking (F'2023 C1017);
+      // still check rank compatibility.
+      if (auto msg{CheckRanks(*frTypeAndShape)}) {
+        Say(*msg);
+        return false;
+      }
+    } else if (!lhsType_->IsCompatibleWith(foldingContext_.messages(),
+                   *frTypeAndShape, "pointer", "function result",
+                   /*omitShapeConformanceCheck=*/isBoundsRemapping_ ||
+                       isAssumedRank_,
+                   evaluate::CheckConformanceFlags::BothDeferredShape)) {
       return false; // IsCompatibleWith() emitted message
     }
   }
@@ -324,27 +334,17 @@ bool PointerAssignmentChecker::Check(const evaluate::Designator<T> &d) {
         msg = "Pointer must be VOLATILE when target is a"
               " VOLATILE coarray"_err_en_US;
       }
+    } else if (auto m{CheckRanks(*rhsType)}) {
+      msg = std::move(*m);
     } else if (rhsType->type().IsUnlimitedPolymorphic()) {
       if (!LhsOkForUnlimitedPoly()) {
         msg = "Pointer type must be unlimited polymorphic or non-extensible"
               " derived type when target is unlimited polymorphic"_err_en_US;
       }
-    } else {
-      if (!lhsType_->type().IsTkLenCompatibleWith(rhsType->type())) {
-        msg = MessageFormattedText{
-            "Target type %s is not compatible with pointer type %s"_err_en_US,
-            rhsType->type().AsFortran(), lhsType_->type().AsFortran()};
-
-      } else if (!isBoundsRemapping_ &&
-          !lhsType_->attrs().test(TypeAndShape::Attr::AssumedRank)) {
-        int lhsRank{lhsType_->Rank()};
-        int rhsRank{rhsType->Rank()};
-        if (lhsRank != rhsRank) {
-          msg = MessageFormattedText{
-              "Pointer has rank %d but target has rank %d"_err_en_US, lhsRank,
-              rhsRank};
-        }
-      }
+    } else if (!lhsType_->type().IsTkLenCompatibleWith(rhsType->type())) {
+      msg = MessageFormattedText{
+          "Target type %s is not compatible with pointer type %s"_err_en_US,
+          rhsType->type().AsFortran(), lhsType_->type().AsFortran()};
     }
   }
   if (msg) {
@@ -434,6 +434,21 @@ bool PointerAssignmentChecker::LhsOkForUnlimitedPoly() const {
   }
 }
 
+std::optional<MessageFormattedText> PointerAssignmentChecker::CheckRanks(
+    const TypeAndShape &rhs) const {
+  if (!isBoundsRemapping_ &&
+      !lhsType_->attrs().test(TypeAndShape::Attr::AssumedRank)) {
+    int lhsRank{lhsType_->Rank()};
+    int rhsRank{rhs.Rank()};
+    if (lhsRank != rhsRank) {
+      return MessageFormattedText{
+          "Pointer has rank %d but target has rank %d"_err_en_US, lhsRank,
+          rhsRank};
+    }
+  }
+  return std::nullopt;
+}
+
 template <typename... A>
 parser::Message *PointerAssignmentChecker::Say(A &&...x) {
   auto *msg{foldingContext_.messages().Say(std::forward<A>(x)...)};
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 122c0a2ebb64..724f1b280783 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -3162,6 +3162,10 @@ ModuleVisitor::SymbolRename ModuleVisitor::AddUse(
 // Convert it to a UseError with this additional location.
 static bool ConvertToUseError(
     Symbol &symbol, const SourceName &location, const Scope &module) {
+  if (auto *ued{symbol.detailsIf<UseErrorDetails>()}) {
+    ued->add_occurrence(location, module);
+    return true;
+  }
   const auto *useDetails{symbol.detailsIf<UseDetails>()};
   if (!useDetails) {
     if (auto *genericDetails{symbol.detailsIf<GenericDetails>()}) {
@@ -3319,6 +3323,8 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName,
         combinedDerivedType = CreateLocalUseError();
       } else {
         ConvertToUseError(*localSymbol, location, *useModuleScope_);
+        localDerivedType = nullptr;
+        localGeneric = nullptr;
         combinedDerivedType = localSymbol;
       }
     }
diff --git a/flang/module/__fortran_builtins.f90 b/flang/module/__fortran_builtins.f90
index ef206dfd9431..4d134fa4b62b 100644
--- a/flang/module/__fortran_builtins.f90
+++ b/flang/module/__fortran_builtins.f90
@@ -22,6 +22,9 @@ module __fortran_builtins
   intrinsic :: __builtin_c_loc
   public :: __builtin_c_loc
 
+  intrinsic :: __builtin_c_devloc
+  public :: __builtin_c_devloc
+
   intrinsic :: __builtin_c_f_pointer
   public :: __builtin_c_f_pointer
 
@@ -40,15 +43,15 @@ module __fortran_builtins
   end type
 
   type, public :: __builtin_event_type
-    integer(kind=int64), private :: __count
+    integer(kind=int64), private :: __count = -1
   end type
 
   type, public :: __builtin_notify_type
-    integer(kind=int64), private :: __count
+    integer(kind=int64), private :: __count = -1
   end type
 
   type, public :: __builtin_lock_type
-    integer(kind=int64), private :: __count
+    integer(kind=int64), private :: __count = -1
   end type
 
   type, public :: __builtin_ieee_flag_type
@@ -88,7 +91,7 @@ module __fortran_builtins
       __builtin_ieee_round_type(_FORTRAN_RUNTIME_IEEE_OTHER)
 
   type, public :: __builtin_team_type
-    integer(kind=int64), private :: __id
+    integer(kind=int64), private :: __id = -1
   end type
 
   integer, parameter, public :: __builtin_atomic_int_kind = selected_int_kind(18)
@@ -144,6 +147,7 @@ module __fortran_builtins
 
   type :: __force_derived_type_instantiations
     type(__builtin_c_ptr) :: c_ptr
+    type(__builtin_c_devptr) :: c_devptr
     type(__builtin_c_funptr) :: c_funptr
     type(__builtin_event_type) :: event_type
     type(__builtin_lock_type) :: lock_type
diff --git a/flang/module/__fortran_type_info.f90 b/flang/module/__fortran_type_info.f90
index 5f2273de1e3d..b30a6bf69756 100644
--- a/flang/module/__fortran_type_info.f90
+++ b/flang/module/__fortran_type_info.f90
@@ -14,7 +14,7 @@
 module __fortran_type_info
 
   use, intrinsic :: __fortran_builtins, &
-    only: __builtin_c_ptr, __builtin_c_funptr
+    only: __builtin_c_ptr, __builtin_c_devptr, __builtin_c_funptr
   implicit none
 
   ! Set PRIVATE by default to explicitly only export what is meant
diff --git a/flang/runtime/derived.cpp b/flang/runtime/derived.cpp
index 7c164ff89045..10813c62e5da 100644
--- a/flang/runtime/derived.cpp
+++ b/flang/runtime/derived.cpp
@@ -129,6 +129,10 @@ RT_API_ATTRS int InitializeClone(const Descriptor &clone,
   std::size_t elements{orig.Elements()};
   int stat{StatOk};
 
+  // Skip pointers and unallocated variables.
+  if (orig.IsPointer() || !orig.IsAllocated()) {
+    return stat;
+  }
   // Initialize each data component.
   std::size_t components{componentDesc.Elements()};
   for (std::size_t i{0}; i < components; ++i) {
diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp
index c714a85a336e..317f0b676bd2 100644
--- a/flang/runtime/edit-input.cpp
+++ b/flang/runtime/edit-input.cpp
@@ -130,7 +130,7 @@ static RT_API_ATTRS bool EditBOZInput(
     shift = shift - 8; // misaligned octal
   }
   while (digits > 0) {
-    char32_t ch{*io.NextInField(remaining, edit)};
+    char32_t ch{io.NextInField(remaining, edit).value_or(' ')};
     int digit{0};
     if (ch == ' ' || ch == '\t') {
       if (edit.modes.editingFlags & blankZero) {
diff --git a/flang/test/Driver/fsave-main-program.f90 b/flang/test/Driver/fsave-main-program.f90
new file mode 100644
index 000000000000..bffdfd97911e
--- /dev/null
+++ b/flang/test/Driver/fsave-main-program.f90
@@ -0,0 +1,5 @@
+! Check that the driver passes through -fsave-main-program:
+! RUN: %flang -### -S -fsave-main-program %s -o - 2>&1 | FileCheck %s
+! Check that the compiler accepts -fsave-main-program:
+! RUN: %flang_fc1 -emit-hlfir -fsave-main-program %s -o -
+! CHECK: "-fc1"{{.*}}"-fsave-main-program"
diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90
index 9655afce96d9..55e86da2dfdf 100644
--- a/flang/test/Driver/mlir-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-pass-pipeline.f90
@@ -49,6 +49,15 @@ end program
 ! ALL: LowerHLFIROrderedAssignments
 ! ALL-NEXT: LowerHLFIRIntrinsics
 ! ALL-NEXT: BufferizeHLFIR
+! O2-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
+! O2-NEXT:   'fir.global' Pipeline
+! O2-NEXT:     InlineHLFIRAssign
+! O2-NEXT:   'func.func' Pipeline
+! O2-NEXT:     InlineHLFIRAssign
+! O2-NEXT:   'omp.declare_reduction' Pipeline
+! O2-NEXT:     InlineHLFIRAssign
+! O2-NEXT:   'omp.private' Pipeline
+! O2-NEXT:     InlineHLFIRAssign
 ! ALL-NEXT: ConvertHLFIRtoFIR
 ! ALL-NEXT: CSE
 ! Ideally, we need an output with only the pass names, but
diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
index 620882ebbed2..29a0f6615797 100644
--- a/flang/test/Fir/basic-program.fir
+++ b/flang/test/Fir/basic-program.fir
@@ -50,6 +50,15 @@ func.func @_QQmain() {
 // PASSES-NEXT:   LowerHLFIROrderedAssignments
 // PASSES-NEXT:   LowerHLFIRIntrinsics
 // PASSES-NEXT:   BufferizeHLFIR
+// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
+// PASSES-NEXT:   'fir.global' Pipeline
+// PASSES-NEXT:     InlineHLFIRAssign
+// PASSES-NEXT:   'func.func' Pipeline
+// PASSES-NEXT:     InlineHLFIRAssign
+// PASSES-NEXT:   'omp.declare_reduction' Pipeline
+// PASSES-NEXT:     InlineHLFIRAssign
+// PASSES-NEXT:   'omp.private' Pipeline
+// PASSES-NEXT:     InlineHLFIRAssign
 // PASSES-NEXT:   ConvertHLFIRtoFIR
 // PASSES-NEXT:   LowerWorkshare
 // PASSES-NEXT:   CSE
diff --git a/flang/test/Fir/convert-fold.fir b/flang/test/Fir/convert-fold.fir
index ebb6c8db7c89..fb30e634ba5e 100644
--- a/flang/test/Fir/convert-fold.fir
+++ b/flang/test/Fir/convert-fold.fir
@@ -35,3 +35,12 @@ func.func @ctest() -> index {
   // CHECK-NEXT: return %{{.*}} : index
   return %2 : index
 }
+
+// CHECK-LABEL:   func.func @ptrtest(
+// CHECK-SAME:                       %[[VAL_0:.*]]: !fir.heap<!fir.array<2xf32>>) -> !fir.heap<!fir.array<2xf32>> {
+func.func @ptrtest(%0 : !fir.heap<!fir.array<2xf32>>) -> !fir.heap<!fir.array<2xf32>> {
+  %1 = fir.convert %0 : (!fir.heap<!fir.array<2xf32>>) -> !fir.ref<!fir.array<2xf32>>
+  %2 = fir.convert %1 : (!fir.ref<!fir.array<2xf32>>) -> !fir.heap<!fir.array<2xf32>>
+// CHECK:           return %[[VAL_0]] : !fir.heap<!fir.array<2xf32>>
+  return %2 : !fir.heap<!fir.array<2xf32>>
+}
diff --git a/flang/test/Integration/debug-local-var-2.f90 b/flang/test/Integration/debug-local-var-2.f90
index 5a675cbe1786..fe4144a3dd46 100644
--- a/flang/test/Integration/debug-local-var-2.f90
+++ b/flang/test/Integration/debug-local-var-2.f90
@@ -107,4 +107,4 @@ contains
   end function
 end program
 
-LINEONLY-NOT: DILocalVariable
+! LINEONLY-NOT: DILocalVariable
diff --git a/flang/test/Lower/CUDA/cuda-cdevloc.cuf b/flang/test/Lower/CUDA/cuda-cdevloc.cuf
new file mode 100644
index 000000000000..a71490207909
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-cdevloc.cuf
@@ -0,0 +1,21 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+attributes(global) subroutine testcdevloc(a)
+  use __fortran_builtins, only: c_devloc => __builtin_c_devloc
+  integer, device :: a(10)
+  print*, c_devloc(a(1))
+end
+
+! CHECK-LABEL: func.func @_QPtestcdevloc(
+! CHECK-SAME: %[[A_ARG:.*]]: !fir.ref<!fir.array<10xi32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+! CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ARG]](%{{.*}}) dummy_scope %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtestcdevlocEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>, !fir.dscope) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[A1:.*]] = hlfir.designate %[[A]]#0 (%c1{{.*}})  : (!fir.ref<!fir.array<10xi32>>, index) -> !fir.ref<i32>
+! CHECK: %[[BOX:.*]] = fir.embox %[[A1]] : (!fir.ref<i32>) -> !fir.box<i32>
+! CHECK: %[[CDEVPTR:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>
+! CHECK: %[[FIELD_CPTR:.*]] = fir.field_index cptr, !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>
+! CHECK: %[[COORD_CPTR:.*]] = fir.coordinate_of %[[CDEVPTR]], %[[FIELD_CPTR]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>, !fir.field) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>
+! CHECK: %[[FIELD_ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
+! CHECK: %[[COORD_ADDRESS:.*]] = fir.coordinate_of %[[COORD_CPTR]], %[[FIELD_ADDRESS]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
+! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box<i32>) -> !fir.ref<i32>
+! CHECK: %[[ADDRESS_A1:.*]] = fir.convert %[[BOX_ADDR]] : (!fir.ref<i32>) -> i64
+! CHECK: fir.store %[[ADDRESS_A1]] to %[[COORD_ADDRESS]] : !fir.ref<i64>
diff --git a/flang/test/Lower/OpenMP/derived-type-allocatable.f90 b/flang/test/Lower/OpenMP/derived-type-allocatable.f90
index d265954ef1ce..2dc4e20f27af 100644
--- a/flang/test/Lower/OpenMP/derived-type-allocatable.f90
+++ b/flang/test/Lower/OpenMP/derived-type-allocatable.f90
@@ -13,6 +13,10 @@ module m1
 
 contains
 
+!CHECK-LABEL: omp.private {type = private} @_QMm1Ftest_pointer
+!CHECK-NOT:   fir.call @_FortranAInitializeClone
+!CHECK:       omp.yield
+
 !CHECK-LABEL: omp.private {type = private} @_QMm1Ftest_nested
 !CHECK:       fir.call @_FortranAInitializeClone
 !CHECK-NEXT:  omp.yield
@@ -91,4 +95,11 @@ contains
     !$omp parallel private(d2)
     !$omp end parallel
   end subroutine
+
+  subroutine test_pointer()
+    type(x), pointer :: ptr
+
+    !$omp parallel private(ptr)
+    !$omp end parallel
+  end subroutine
 end module
diff --git a/flang/test/Lower/OpenMP/firstprivate-alloc-comp.f90 b/flang/test/Lower/OpenMP/firstprivate-alloc-comp.f90
new file mode 100644
index 000000000000..2453fe2c5208
--- /dev/null
+++ b/flang/test/Lower/OpenMP/firstprivate-alloc-comp.f90
@@ -0,0 +1,19 @@
+! Test delayed privatization for derived types with allocatable components.
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s | FileCheck %s
+
+subroutine firstprivate_alloc_comp
+  type t1
+    integer, allocatable :: c(:)
+  end type
+  type(t1) :: x
+  !$omp parallel firstprivate(x)
+    print *, allocated(x%c)
+  !$omp end parallel
+end
+
+  call firstprivate_alloc_comp()
+end
+! CHECK-LABEL:   omp.private {type = firstprivate} @_QFfirstprivate_alloc_compEx_firstprivate_ref_rec__QFfirstprivate_alloc_compTt1 : !fir.ref<!fir.type<_QFfirstprivate_alloc_compTt1{c:!fir.box<!fir.heap<!fir.array<?xi32>>>}>> alloc {
+! CHECK:     fir.call @_FortranAInitialize(
+! CHECK:   } copy {
+! ...
diff --git a/flang/test/Lower/array-substring.f90 b/flang/test/Lower/array-substring.f90
index 02101039120e..7544fbb98962 100644
--- a/flang/test/Lower/array-substring.f90
+++ b/flang/test/Lower/array-substring.f90
@@ -24,9 +24,8 @@
 ! CHECK:         %[[VAL_16:.*]] = fir.array_coor %[[VAL_7]](%[[VAL_9]]) {{\[}}%[[VAL_10]]] %[[VAL_15]] : (!fir.ref<!fir.array<1x!fir.char<1,12>>>, !fir.shape<1>, !fir.slice<1>, index) -> !fir.ref<!fir.char<1,12>>
 ! CHECK:         %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (!fir.ref<!fir.char<1,12>>) -> !fir.ref<!fir.array<12x!fir.char<1>>>
 ! CHECK:         %[[VAL_18:.*]] = fir.coordinate_of %[[VAL_17]], %[[VAL_2]] : (!fir.ref<!fir.array<12x!fir.char<1>>>, index) -> !fir.ref<!fir.char<1>>
-! CHECK:         %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<!fir.char<1,?>>
 ! CHECK:         %[[VAL_20:.*]] = fir.array_coor %[[VAL_11]](%[[VAL_9]]) %[[VAL_15]] : (!fir.ref<!fir.array<1x!fir.char<1,8>>>, !fir.shape<1>, index) -> !fir.ref<!fir.char<1,8>>
-! CHECK:         %[[VAL_21:.*]] = fir.convert %[[VAL_19]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
+! CHECK:         %[[VAL_21:.*]] = fir.convert %[[VAL_18]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<i8>
 ! CHECK:         %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.ref<!fir.char<1,8>>) -> !fir.ref<i8>
 ! CHECK:         %[[VAL_23:.*]] = fir.convert %[[VAL_4]] : (index) -> i64
 ! CHECK:         %[[VAL_24:.*]] = fir.call @_FortranACharacterCompareScalar1(%[[VAL_21]], %[[VAL_22]], %[[VAL_23]], %[[VAL_23]]) {{.*}}: (!fir.ref<i8>, !fir.ref<i8>, i64, i64) -> i32
diff --git a/flang/test/Lower/fsave-main-program.f90 b/flang/test/Lower/fsave-main-program.f90
new file mode 100644
index 000000000000..17fc1b02f506
--- /dev/null
+++ b/flang/test/Lower/fsave-main-program.f90
@@ -0,0 +1,10 @@
+! Test -fsave-main-program switch.
+! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s
+! RUN: %flang_fc1 -fsave-main-program -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-SAVE %s
+program test
+integer :: i
+call foo(i)
+end
+
+!CHECK-DEFAULT-NOT: fir.global internal @_QFEi
+!CHECK-SAVE: fir.global internal @_QFEi
diff --git a/flang/test/Lower/vector-subscript-io.f90 b/flang/test/Lower/vector-subscript-io.f90
index 372130fd0990..9a041af16c88 100644
--- a/flang/test/Lower/vector-subscript-io.f90
+++ b/flang/test/Lower/vector-subscript-io.f90
@@ -325,12 +325,11 @@ subroutine substring(x, y, i, j)
 ! CHECK:   %[[VAL_230:.*]] = arith.subi %[[VAL_216]], %[[VAL_210]] : index
 ! CHECK:   %[[VAL_231:.*]] = fir.convert %[[VAL_228]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<?x!fir.char<1>>>
 ! CHECK:   %[[VAL_232:.*]] = fir.coordinate_of %[[VAL_231]], %[[VAL_230]] : (!fir.ref<!fir.array<?x!fir.char<1>>>, index) -> !fir.ref<!fir.char<1>>
-! CHECK:   %[[VAL_233:.*]] = fir.convert %[[VAL_232]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<!fir.char<1,?>>
 ! CHECK:   %[[VAL_234:.*]] = arith.subi %[[VAL_219]], %[[VAL_216]] : index
 ! CHECK:   %[[VAL_235:.*]] = arith.addi %[[VAL_234]], %[[VAL_210]] : index
 ! CHECK:   %[[VAL_236:.*]] = arith.cmpi slt, %[[VAL_235]], %[[VAL_209]] : index
 ! CHECK:   %[[VAL_237:.*]] = arith.select %[[VAL_236]], %[[VAL_209]], %[[VAL_235]] : index
-! CHECK:   %[[VAL_238:.*]] = fir.convert %[[VAL_233]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
+! CHECK:   %[[VAL_238:.*]] = fir.convert %[[VAL_232]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<i8>
 ! CHECK:   %[[VAL_239:.*]] = fir.convert %[[VAL_237]] : (index) -> i64
 ! CHECK:   %[[VAL_240:.*]] = fir.call @_FortranAioInputAscii(%[[VAL_213]], %[[VAL_238]], %[[VAL_239]]) {{.*}}: (!fir.ref<i8>, !fir.ref<i8>, i64) -> i1
 ! CHECK:   %[[VAL_241:.*]] = arith.addi %[[VAL_221]], %[[VAL_210]] overflow<nsw> : index
diff --git a/flang/test/Parser/OpenMP/compiler-directive-continuation.f90 b/flang/test/Parser/OpenMP/compiler-directive-continuation.f90
new file mode 100644
index 000000000000..87e4a72c5429
--- /dev/null
+++ b/flang/test/Parser/OpenMP/compiler-directive-continuation.f90
@@ -0,0 +1,44 @@
+! RUN: %flang_fc1 -fopenmp -E %s 2>&1 | FileCheck %s --check-prefix=CHECK-OMP
+! RUN: %flang_fc1 -E %s 2>&1 | FileCheck %s 
+
+
+! Test in mixed way, i.e., combination of Fortran free source form 
+! and free source form with conditional compilation sentinel.
+! CHECK-LABEL: subroutine mixed_form1()
+! CHECK-OMP: i = 1 +100+ 1000+ 10 + 1 +1000000000 + 1000000
+! CHECK: i = 1 + 10 + 10000 + 1000000
+subroutine mixed_form1()
+   i = 1 &
+  !$+100&
+  !$&+ 1000&
+   &+ 10 + 1&
+  !$& +100000&
+   &0000 + 1000000
+end subroutine	
+
+
+! Testing continuation lines in only Fortran Free form Source
+! CHECK-LABEL: subroutine mixed_form2()
+! CHECK-OMP: i = 1 +10 +100 + 1000 + 10000
+! CHECK: i = 1 +10 +100 + 1000 + 10000
+subroutine mixed_form2()
+   i = 1 &
+   +10 &
+   &+100
+   & + 1000 &
+   + 10000
+end subroutine
+
+
+! Testing continuation line in only free source form conditional compilation sentinel.
+! CHECK-LABEL: subroutine mixed_form3()
+! CHECK-OMP: i=0
+! CHECK-OMP: i = 1 +10 +100+1000
+subroutine mixed_form3()
+   !$ i=0
+   !$ i = 1 &
+   !$ & +10 &
+   !$&+100&
+   !$ +1000 
+end subroutine
+
diff --git a/flang/test/Parser/at-process.f b/flang/test/Parser/at-process.f
index 41b95044bfdc..4f54c6b65638 100644
--- a/flang/test/Parser/at-process.f
+++ b/flang/test/Parser/at-process.f
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s
+! RUN: not %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s
 
 ! Test ignoring @PROCESS directive in fixed source form
 
@@ -18,3 +18,5 @@ c@process
 
 !CHECK: Character in fixed-form label field must be a digit
 @precoss 
+
+!CHECK: at-process.f:14:1: error: parser FAIL (final position)
diff --git a/flang/test/Parser/unparseable.f90 b/flang/test/Parser/unparseable.f90
new file mode 100644
index 000000000000..9e7a890b67a3
--- /dev/null
+++ b/flang/test/Parser/unparseable.f90
@@ -0,0 +1,5 @@
+! RUN: not %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s
+! CHECK: unparseable.f90:5:1: error: parser FAIL (final position)
+module m
+end
+select type (barf)
diff --git a/flang/test/Preprocessing/bug129131.F b/flang/test/Preprocessing/bug129131.F
new file mode 100644
index 000000000000..5b1a914a2c9e
--- /dev/null
+++ b/flang/test/Preprocessing/bug129131.F
@@ -0,0 +1,5 @@
+! RUN: %flang -fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
+! CHECK: PRINT *, 2_4
+#define a ,3
+      print *, mod(5 a)
+      end
diff --git a/flang/test/Semantics/assign16.f90 b/flang/test/Semantics/assign16.f90
new file mode 100644
index 000000000000..2e65829ff990
--- /dev/null
+++ b/flang/test/Semantics/assign16.f90
@@ -0,0 +1,46 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+! The RHS of a pointer assignment can be unlimited polymorphic
+! if the LHS is a sequence type.
+program main
+  type nonSeqType
+    integer j
+  end type
+  type seqType
+    sequence
+    integer j
+  end type
+  type(nonSeqType), target :: xNonSeq = nonSeqType(1)
+  type(nonSeqType), pointer :: pNonSeq
+  type(seqType), target :: xSeq = seqType(1), aSeq(1)
+  type(seqType), pointer :: pSeq, paSeq(:)
+  !ERROR: function result type 'CLASS(*)' is not compatible with pointer type 'nonseqtype'
+  pNonSeq => polyPtr(xNonSeq)
+  pSeq => polyPtr(xSeq) ! ok
+  !ERROR: Pointer has rank 1 but target has rank 0
+  paSeq => polyPtr(xSeq)
+  !ERROR: Pointer has rank 0 but target has rank 1
+  pSeq => polyPtrArr(aSeq)
+ contains
+  function polyPtr(target)
+    class(*), intent(in), target :: target
+    class(*), pointer :: polyPtr
+    polyPtr => target
+  end
+  function polyPtrArr(target)
+    class(*), intent(in), target :: target(:)
+    class(*), pointer :: polyPtrArr(:)
+    polyPtrArr => target
+  end
+  function err1(target)
+    class(*), intent(in), target :: target(:)
+    class(*), pointer :: err1
+    !ERROR: Pointer has rank 0 but target has rank 1
+    err1 => target
+  end
+  function err2(target)
+    class(*), intent(in), target :: target
+    class(*), pointer :: err2(:)
+    !ERROR: Pointer has rank 1 but target has rank 0
+    err2 => target
+  end
+end
diff --git a/flang/test/Semantics/bug121718.f90 b/flang/test/Semantics/bug121718.f90
new file mode 100644
index 000000000000..e99391f227d7
--- /dev/null
+++ b/flang/test/Semantics/bug121718.f90
@@ -0,0 +1,31 @@
+! RUN: %flang_fc1 2>&1 | FileCheck %s --allow-empty
+! CHECK-NOT: error
+! Regression test simplified from LLVM bug 121718.
+! Ensure no crash and no spurious error message.
+module m1
+  type foo
+    integer x
+  end type
+ contains
+  subroutine test
+    print *, foo(123)
+  end
+end
+module m2
+  interface foo
+    procedure f
+  end interface
+  type foo
+    real x
+  end type
+ contains
+  complex function f(x)
+    complex, intent(in) :: x
+    f = x
+  end
+end
+program main
+  use m1
+  use m2
+  call test
+end
diff --git a/flang/test/Semantics/call04.f90 b/flang/test/Semantics/call04.f90
index 6877f9c9fa93..9be579fb696c 100644
--- a/flang/test/Semantics/call04.f90
+++ b/flang/test/Semantics/call04.f90
@@ -21,10 +21,14 @@ module m
   subroutine s01a(x)
     real, allocatable, intent(out) :: x(:)
   end subroutine
+  subroutine s01c(x)
+    real, intent(out) :: x(:)
+  end subroutine
   subroutine s01b ! C846 - can only be caught at a call via explicit interface
     !ERROR: ALLOCATABLE coarray 'coarray' may not be associated with INTENT(OUT) dummy argument 'x='
     !ERROR: ALLOCATABLE dummy argument 'x=' has corank 0 but actual argument has corank 1
     call s01a(coarray)
+    call s01c(coarray) ! ok, dummy is not allocatable
   end subroutine
 
   subroutine s02(x) ! C846
diff --git a/flang/test/Semantics/definable01.f90 b/flang/test/Semantics/definable01.f90
index ff71b419fa97..d3b31ee38b2a 100644
--- a/flang/test/Semantics/definable01.f90
+++ b/flang/test/Semantics/definable01.f90
@@ -109,7 +109,29 @@ module m
   end
   pure subroutine test7(lp)
     type(list), pointer :: lp
-    !CHECK-NOT: error:
-    lp%next%next => null()
+    lp%next%next => null() ! ok
   end
 end module
+program main
+  use iso_fortran_env, only: lock_type
+  type(lock_type) lock
+  interface
+    subroutine inlock(lock)
+      import lock_type
+      type(lock_type), intent(in) :: lock
+    end
+    subroutine outlock(lock)
+      import lock_type
+      !CHECK: error: An INTENT(OUT) dummy argument may not be, or contain, EVENT_TYPE or LOCK_TYPE
+      type(lock_type), intent(out) :: lock
+    end
+    subroutine inoutlock(lock)
+      import lock_type
+      type(lock_type), intent(in out) :: lock
+    end
+  end interface
+  call inlock(lock) ! ok
+  call inoutlock(lock) ! ok
+  !CHECK: error: Actual argument associated with INTENT(OUT) dummy argument 'lock=' is not definable
+  call outlock(lock)
+end
diff --git a/flang/test/Semantics/get_team.f90 b/flang/test/Semantics/get_team.f90
index 7e4886703d17..a5b49a83f95f 100644
--- a/flang/test/Semantics/get_team.f90
+++ b/flang/test/Semantics/get_team.f90
@@ -10,6 +10,8 @@ program get_team_test
   type(team_type) :: result_team
   logical wrong_result_type, non_integer
 
+  result_team = team_type()
+
   !___ standard-conforming statement with no optional arguments present ___
   result_team = get_team()
 
diff --git a/flang/test/Semantics/io07.f90 b/flang/test/Semantics/io07.f90
index 64a32c995928..a013849472f6 100644
--- a/flang/test/Semantics/io07.f90
+++ b/flang/test/Semantics/io07.f90
@@ -68,10 +68,10 @@
 6001 format(((I0, B0)))
 
      !ERROR: 'A' edit descriptor 'w' value must be positive
-     !ERROR: 'L' edit descriptor 'w' value must be positive
+     !WARNING: 'L' edit descriptor 'w' value should be positive
 6101 format((A0), ((L0)))
 
-     !ERROR: 'L' edit descriptor 'w' value must be positive
+     !WARNING: 'L' edit descriptor 'w' value should be positive
 6102 format((3(((L 0 0 0)))))
 
 7001 format(17G8.1, 17G8.1e3)
diff --git a/flang/test/Semantics/io08.f90 b/flang/test/Semantics/io08.f90
index f6038b471759..517984fe3433 100644
--- a/flang/test/Semantics/io08.f90
+++ b/flang/test/Semantics/io08.f90
@@ -192,8 +192,7 @@
   !ERROR: 'A' edit descriptor 'w' value must be positive
   write(*,'(A0)')
 
-  !ERROR: 'L' edit descriptor 'w' value must be positive
-  write(*,'(L0)')
+  write(*,'(L0)') ! warning, not error
 
   !ERROR: Expected 'G' edit descriptor '.d' value
   write(*,'(G4)')
diff --git a/flang/test/Semantics/lcobound.f90 b/flang/test/Semantics/lcobound.f90
index ce2f001ce2ea..f03f2cae03ec 100644
--- a/flang/test/Semantics/lcobound.f90
+++ b/flang/test/Semantics/lcobound.f90
@@ -11,6 +11,9 @@ program lcobound_tests
   logical non_integer, logical_coarray[3,*]
   logical, parameter :: const_non_integer = .true.
   integer, allocatable :: lcobounds(:)
+  real bounded[2:3,4:5,*]
+
+  integer(kind=merge(kind(1),-1,all(lcobound(bounded)==[2,4,1]))) test_lcobound
 
   !___ standard-conforming statement with no optional arguments present ___
   lcobounds = lcobound(scalar_coarray)
@@ -50,28 +53,28 @@ program lcobound_tests
 
   !___ non-conforming statements ___
 
-  !ERROR: DIM=0 dimension is out of range for coarray with corank 1
+  !ERROR: DIM=0 dimension must be positive
   n = lcobound(scalar_coarray, dim=0)
 
-  !ERROR: DIM=0 dimension is out of range for coarray with corank 3
+  !ERROR: DIM=0 dimension must be positive
   n = lcobound(coarray_corank3, dim=0)
 
-  !ERROR: DIM=-1 dimension is out of range for coarray with corank 1
+  !ERROR: DIM=-1 dimension must be positive
   n = lcobound(scalar_coarray, dim=-1)
 
-  !ERROR: DIM=2 dimension is out of range for coarray with corank 1
+  !ERROR: DIM=2 dimension is out of range for corank-1 coarray
   n = lcobound(array_coarray, dim=2)
 
-  !ERROR: DIM=2 dimension is out of range for coarray with corank 1
+  !ERROR: DIM=2 dimension is out of range for corank-1 coarray
   n = lcobound(array_coarray, 2)
 
-  !ERROR: DIM=4 dimension is out of range for coarray with corank 3
+  !ERROR: DIM=4 dimension is out of range for corank-3 coarray
   n = lcobound(coarray_corank3, dim=4)
 
-  !ERROR: DIM=4 dimension is out of range for coarray with corank 3
+  !ERROR: DIM=4 dimension is out of range for corank-3 coarray
   n = lcobound(dim=4, coarray=coarray_corank3)
 
-  !ERROR: DIM=5 dimension is out of range for coarray with corank 3
+  !ERROR: DIM=5 dimension is out of range for corank-3 coarray
   n = lcobound(coarray_corank3, const_out_of_range_dim)
 
   !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar INTEGER(4) and rank 1 array of INTEGER(4)
diff --git a/flang/test/Semantics/resolve94.f90 b/flang/test/Semantics/resolve94.f90
index e47ab4a43382..19c06ad0d162 100644
--- a/flang/test/Semantics/resolve94.f90
+++ b/flang/test/Semantics/resolve94.f90
@@ -17,8 +17,15 @@ subroutine s1()
   intCoVar = 343
   ! OK
   rVar1 = rCoarray[1,2,3]
+  associate (x => rCoarray)
+    rVar1 = x[1,2,3] ! also ok
+  end associate
   !ERROR: 'rcoarray' has corank 3, but coindexed reference has 2 cosubscripts
   rVar1 = rCoarray[1,2]
+  associate (x => rCoarray)
+  !ERROR: 'x' has corank 3, but coindexed reference has 2 cosubscripts
+    rVar1 = x[1,2]
+  end associate
   !ERROR: Must have INTEGER type, but is REAL(4)
   rVar1 = rCoarray[1,2,3.4]
   !ERROR: Must have INTEGER type, but is REAL(4)
diff --git a/flang/test/Semantics/this_image01.f90 b/flang/test/Semantics/this_image01.f90
index 0e59aa3fa27c..fdcccdaeed0e 100644
--- a/flang/test/Semantics/this_image01.f90
+++ b/flang/test/Semantics/this_image01.f90
@@ -8,6 +8,8 @@ subroutine test
   type(team_type) :: coteam[*]
   integer :: coscalar[*], coarray(3)[*]
   save :: coteam, coscalar, coarray
+  real coarray1[*], coarray2[2,*], coarray3[2,3,*]
+  integer indices(3)
 
   ! correct calls, should produce no errors
   team = get_team()
@@ -17,6 +19,10 @@ subroutine test
   print *, this_image(coarray, team)
   print *, this_image(coarray, 1)
   print *, this_image(coarray, 1, team)
+  print *, this_image(coarray(1))
+  print *, this_image(coarray(1), team)
+  print *, this_image(coarray(1), 1)
+  print *, this_image(coarray(1), 1, team)
   print *, this_image(coscalar)
   print *, this_image(coscalar, team)
   print *, this_image(coscalar, 1)
@@ -28,4 +34,14 @@ subroutine test
   print *, team_number()
   print *, team_number(team)
 
+  indices(1:1) = this_image(coarray1) ! ok
+  indices(1:2) = this_image(coarray2) ! ok
+  indices(1:3) = this_image(coarray3) ! ok
+  !ERROR: Dimension 1 of left-hand side has extent 2, but right-hand side has extent 1
+  indices(1:2) = this_image(coarray1)
+  !ERROR: Dimension 1 of left-hand side has extent 3, but right-hand side has extent 2
+  indices(1:3) = this_image(coarray2)
+  !ERROR: Dimension 1 of left-hand side has extent 1, but right-hand side has extent 3
+  indices(1:1) = this_image(coarray3)
+
 end subroutine
diff --git a/flang/test/Semantics/ucobound.f90 b/flang/test/Semantics/ucobound.f90
index f9da11a03a6b..d84c80cdd315 100644
--- a/flang/test/Semantics/ucobound.f90
+++ b/flang/test/Semantics/ucobound.f90
@@ -11,6 +11,9 @@ program ucobound_tests
   logical non_integer, logical_coarray[3,*]
   logical, parameter :: const_non_integer = .true.
   integer, allocatable :: ucobounds(:)
+  real bounded[2:3,4:5,*]
+
+  integer(kind=merge(kind(1),-1,ucobound(bounded,1)==3.and.ucobound(bounded,2)==5)) test_ucobound
 
   !___ standard-conforming statement with no optional arguments present ___
   ucobounds = ucobound(scalar_coarray)
@@ -50,28 +53,28 @@ program ucobound_tests
 
   !___ non-conforming statements ___
 
-  !ERROR: DIM=0 dimension is out of range for coarray with corank 1
+  !ERROR: DIM=0 dimension must be positive
   n = ucobound(scalar_coarray, dim=0)
 
-  !ERROR: DIM=0 dimension is out of range for coarray with corank 3
+  !ERROR: DIM=0 dimension must be positive
   n = ucobound(coarray_corank3, dim=0)
 
-  !ERROR: DIM=-1 dimension is out of range for coarray with corank 1
+  !ERROR: DIM=-1 dimension must be positive
   n = ucobound(scalar_coarray, dim=-1)
 
-  !ERROR: DIM=2 dimension is out of range for coarray with corank 1
+  !ERROR: DIM=2 dimension is out of range for corank-1 coarray
   n = ucobound(array_coarray, dim=2)
 
-  !ERROR: DIM=2 dimension is out of range for coarray with corank 1
+  !ERROR: DIM=2 dimension is out of range for corank-1 coarray
   n = ucobound(array_coarray, 2)
 
-  !ERROR: DIM=4 dimension is out of range for coarray with corank 3
+  !ERROR: DIM=4 dimension is out of range for corank-3 coarray
   n = ucobound(coarray_corank3, dim=4)
 
-  !ERROR: DIM=4 dimension is out of range for coarray with corank 3
+  !ERROR: DIM=4 dimension is out of range for corank-3 coarray
   n = ucobound(dim=4, coarray=coarray_corank3)
 
-  !ERROR: DIM=5 dimension is out of range for coarray with corank 3
+  !ERROR: DIM=5 dimension is out of range for corank-3 coarray
   n = ucobound(coarray_corank3, const_out_of_range_dim)
 
   !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar INTEGER(4) and rank 1 array of INTEGER(4)
diff --git a/flang/test/Transforms/stack-arrays.fir b/flang/test/Transforms/stack-arrays.fir
index 66cd2a5aa910..444136d53e03 100644
--- a/flang/test/Transforms/stack-arrays.fir
+++ b/flang/test/Transforms/stack-arrays.fir
@@ -379,7 +379,8 @@ func.func @placement_loop_declare() {
     %3 = arith.addi %c1, %c2 : index
     // operand is now available
     %4 = fir.allocmem !fir.array<?xi32>, %3
-    %5 = fir.declare %4 {uniq_name = "temp"} : (!fir.heap<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>>
+    %shape = fir.shape %3 : (index) -> !fir.shape<1>
+    %5 = fir.declare %4(%shape) {uniq_name = "temp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.heap<!fir.array<?xi32>>
     // ...
     fir.freemem %5 : !fir.heap<!fir.array<?xi32>>
     fir.result %3, %c1_i32 : index, i32
@@ -400,3 +401,20 @@ func.func @placement_loop_declare() {
 // CHECK-NEXT:   }
 // CHECK-NEXT:   return
 // CHECK-NEXT: }
+
+// Can we look through fir.convert and fir.declare?
+func.func @lookthrough() {
+  %0 = fir.allocmem !fir.array<42xi32>
+  %c42 = arith.constant 42 : index
+  %shape = fir.shape %c42 : (index) -> !fir.shape<1>
+  %1 = fir.declare %0(%shape) {uniq_name = "name"} : (!fir.heap<!fir.array<42xi32>>, !fir.shape<1>) -> !fir.heap<!fir.array<42xi32>>
+  %2 = fir.convert %1 : (!fir.heap<!fir.array<42xi32>>) -> !fir.ref<!fir.array<42xi32>>
+  // use the ref so the converts aren't folded
+  %3 = fir.load %2 : !fir.ref<!fir.array<42xi32>>
+  %4 = fir.convert %2 : (!fir.ref<!fir.array<42xi32>>) -> !fir.heap<!fir.array<42xi32>>
+  fir.freemem %4 : !fir.heap<!fir.array<42xi32>>
+  return
+}
+// CHECK: func.func @lookthrough() {
+// CHECK:     fir.alloca !fir.array<42xi32>
+// CHECK-NOT: fir.freemem
diff --git a/flang/unittests/Runtime/LogicalFormatTest.cpp b/flang/unittests/Runtime/LogicalFormatTest.cpp
index c4fbfc81f06a..26c9374be133 100644
--- a/flang/unittests/Runtime/LogicalFormatTest.cpp
+++ b/flang/unittests/Runtime/LogicalFormatTest.cpp
@@ -23,7 +23,7 @@ TEST(IOApiTests, LogicalFormatTest) {
   char buffer[bufferSize];
 
   // Create format for all types and values to be written
-  const char *format{"(L,L3,I3,L2,L2,I3,L2,A3,L2,L,F4.1,L2)"};
+  const char *format{"(L0,L3,I3,L2,L2,I3,L2,A3,L2,L,F4.1,L2)"};
   auto cookie{IONAME(BeginInternalFormattedOutput)(
       buffer, bufferSize, format, std::strlen(format))};
 
diff --git a/libc/cmake/modules/LLVMLibCHeaderRules.cmake b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
index 0de5e14359cf..288e4dade0b4 100644
--- a/libc/cmake/modules/LLVMLibCHeaderRules.cmake
+++ b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
@@ -75,7 +75,7 @@ function(add_gen_header target_name)
   cmake_parse_arguments(
     "ADD_GEN_HDR"
     "PUBLIC" # No optional arguments
-    "YAML_FILE;DEF_FILE;GEN_HDR" # Single value arguments
+    "YAML_FILE;GEN_HDR" # Single value arguments
     "DEPENDS"     # Multi value arguments
     ${ARGN}
   )
@@ -84,9 +84,6 @@ function(add_gen_header target_name)
     add_library(${fq_target_name} INTERFACE)
     return()
   endif()
-  if(NOT ADD_GEN_HDR_DEF_FILE)
-    message(FATAL_ERROR "`add_gen_hdr` rule requires DEF_FILE to be specified.")
-  endif()
   if(NOT ADD_GEN_HDR_GEN_HDR)
     message(FATAL_ERROR "`add_gen_hdr` rule requires GEN_HDR to be specified.")
   endif()
@@ -97,8 +94,8 @@ function(add_gen_header target_name)
   set(absolute_path ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_GEN_HDR_GEN_HDR})
   file(RELATIVE_PATH relative_path ${LIBC_INCLUDE_SOURCE_DIR} ${absolute_path})
   set(out_file ${LIBC_INCLUDE_DIR}/${relative_path})
+  set(dep_file "${out_file}.d")
   set(yaml_file ${CMAKE_SOURCE_DIR}/${ADD_GEN_HDR_YAML_FILE})
-  set(def_file ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_GEN_HDR_DEF_FILE})
 
   set(fq_data_files "")
   if(ADD_GEN_HDR_DATA_FILES)
@@ -108,18 +105,20 @@ function(add_gen_header target_name)
   endif()
 
   set(entry_points "${TARGET_ENTRYPOINT_NAME_LIST}")
-  list(TRANSFORM entry_points PREPEND "--e=")
+  list(TRANSFORM entry_points PREPEND "--entry-point=")
 
-  set(LIBC_HDRGEN "${LIBC_SOURCE_DIR}/utils/hdrgen/yaml_to_classes.py")
   add_custom_command(
     OUTPUT ${out_file}
-    COMMAND ${Python3_EXECUTABLE} ${LIBC_HDRGEN}
-            ${yaml_file}
-            --h_def_file ${def_file}
+    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+    COMMAND ${Python3_EXECUTABLE} "${LIBC_SOURCE_DIR}/utils/hdrgen/main.py"
+            --output ${out_file}
+            --depfile ${dep_file}
+            --write-if-changed
             ${entry_points}
-            --output_dir ${out_file}
-    DEPENDS ${yaml_file} ${def_file} ${fq_data_files}
-    COMMENT "Generating header ${ADD_GEN_HDR_GEN_HDR} from ${yaml_file} and ${def_file}"
+            ${yaml_file}
+    DEPENDS ${yaml_file} ${fq_data_files}
+    DEPFILE ${dep_file}
+    COMMENT "Generating header ${ADD_GEN_HDR_GEN_HDR} from ${yaml_file}"
   )
   if(LIBC_TARGET_OS_IS_GPU)
     file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/llvm-libc-decls)
@@ -127,7 +126,7 @@ function(add_gen_header target_name)
     set(decl_out_file ${LIBC_INCLUDE_DIR}/llvm-libc-decls/${relative_path})
     add_custom_command(
       OUTPUT ${decl_out_file}
-      COMMAND ${Python3_EXECUTABLE} ${LIBC_HDRGEN}
+      COMMAND ${Python3_EXECUTABLE} "${LIBC_SOURCE_DIR}/utils/hdrgen/yaml_to_classes.py"
               ${yaml_file}
               --export-decls
               ${entry_points}
diff --git a/libc/config/baremetal/config.json b/libc/config/baremetal/config.json
index 85e80879d498..08c581d1c682 100644
--- a/libc/config/baremetal/config.json
+++ b/libc/config/baremetal/config.json
@@ -30,10 +30,5 @@
     "LIBC_CONF_MATH_OPTIMIZATIONS": {
       "value": "(LIBC_MATH_SKIP_ACCURATE_PASS | LIBC_MATH_SMALL_TABLES)"
     }
-  },
-  "codegen": {
-    "LIBC_CONF_KEEP_FRAME_POINTER": {
-      "value": false
-    }
   }
 }
diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt
index 5156b58ee11a..1674de142015 100644
--- a/libc/hdr/types/CMakeLists.txt
+++ b/libc/hdr/types/CMakeLists.txt
@@ -86,6 +86,14 @@ add_proxy_header_library(
 )
 
 add_proxy_header_library(
+  struct_tm
+  HDRS
+    struct_tm.h
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.struct_tm
+)
+
+add_proxy_header_library(
   size_t
   HDRS
     size_t.h
diff --git a/libc/hdr/types/struct_tm.h b/libc/hdr/types/struct_tm.h
new file mode 100644
index 000000000000..96c23e2ce054
--- /dev/null
+++ b/libc/hdr/types/struct_tm.h
@@ -0,0 +1,21 @@
+//===-- Proxy for struct tm  ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_HDR_TYPES_STRUCT_TM_H
+#define LLVM_LIBC_HDR_TYPES_STRUCT_TM_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-types/struct_tm.h"
+
+#else
+
+#include <time.h>
+
+#endif // LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_TYPES_STRUCT_TM_H
diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
index eb407183c99f..568bb05d9230 100644
--- a/libc/include/CMakeLists.txt
+++ b/libc/include/CMakeLists.txt
@@ -19,11 +19,10 @@ add_header(
 
 # TODO: Can we simplify this macro expansion?
 # https://github.com/llvm/llvm-project/issues/117254
-macro(add_header_macro TARGET_NAME YAML_FILE DEF_FILE GEN_HDR DEPENDS)
+macro(add_header_macro TARGET_NAME YAML_FILE GEN_HDR DEPENDS)
   add_gen_header(
     ${TARGET_NAME}
     YAML_FILE ${YAML_FILE}
-    DEF_FILE ${DEF_FILE}
     GEN_HDR ${GEN_HDR}
     ${DEPENDS}
     ${ARGN}
@@ -33,7 +32,6 @@ endmacro()
 add_header_macro(
   ctype
   ../libc/include/ctype.yaml
-  ctype.h.def
   ctype.h
   DEPENDS
     .llvm_libc_common_h
@@ -43,7 +41,6 @@ add_header_macro(
 add_header_macro(
   dirent
   ../libc/include/dirent.yaml
-  dirent.h.def
   dirent.h
   DEPENDS
     .llvm_libc_common_h
@@ -55,7 +52,6 @@ add_header_macro(
 add_header_macro(
   fcntl
   ../libc/include/fcntl.yaml
-  fcntl.h.def
   fcntl.h
   DEPENDS
     .llvm-libc-macros.fcntl_macros
@@ -71,7 +67,6 @@ add_header_macro(
 add_header_macro(
   dlfcn
   ../libc/include/dlfcn.yaml
-  dlfcn.h.def
   dlfcn.h
   DEPENDS
     .llvm-libc-macros.dlfcn_macros
@@ -81,7 +76,6 @@ add_header_macro(
 add_header_macro(
   features
   ../libc/include/features.yaml
-  features.h.def
   features.h
   DEPENDS
     .llvm_libc_common_h
@@ -91,7 +85,6 @@ add_header_macro(
 add_header_macro(
   fenv
   ../libc/include/fenv.yaml
-  fenv.h.def
   fenv.h
   DEPENDS
     .llvm_libc_common_h
@@ -103,7 +96,6 @@ add_header_macro(
 add_header_macro(
   inttypes
   ../libc/include/inttypes.yaml
-  inttypes.h.def
   inttypes.h
   DEPENDS
     .llvm_libc_common_h
@@ -114,7 +106,6 @@ add_header_macro(
 add_header_macro(
   float
   ../libc/include/float.yaml
-  float.h.def
   float.h
   DEPENDS
     .llvm-libc-macros.float_macros
@@ -123,7 +114,6 @@ add_header_macro(
 add_header_macro(
   stdint
   ../libc/include/stdint.yaml
-  stdint.h.def
   stdint.h
   DEPENDS
     .llvm-libc-macros.stdint_macros
@@ -132,7 +122,6 @@ add_header_macro(
 add_header_macro(
   limits
   ../libc/include/limits.yaml
-  limits.h.def
   limits.h
   DEPENDS
     .llvm-libc-macros.limits_macros
@@ -141,7 +130,6 @@ add_header_macro(
 add_header_macro(
   malloc
   ../libc/include/malloc.yaml
-  malloc.h.def
   malloc.h
   DEPENDS
     .llvm_libc_common_h
@@ -151,7 +139,6 @@ add_header_macro(
 add_header_macro(
   math
   ../libc/include/math.yaml
-  math.h.def
   math.h
   DEPENDS
     .llvm_libc_common_h
@@ -166,7 +153,6 @@ add_header_macro(
 add_header_macro(
   stdfix
   ../libc/include/stdfix.yaml
-  stdfix.h.def
   stdfix.h
   DEPENDS
     .llvm-libc-macros.stdfix_macros
@@ -179,7 +165,6 @@ file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/arpa)
 add_header_macro(
   arpa_inet
   ../libc/include/arpa/inet.yaml
-  arpa/inet.h.def
   arpa/inet.h
   DEPENDS
     .llvm_libc_common_h
@@ -188,7 +173,6 @@ add_header_macro(
 add_header_macro(
   assert
   ../libc/include/assert.yaml
-  assert.h.def
   assert.h
   DEPENDS
     .llvm_libc_common_h
@@ -198,7 +182,6 @@ add_header_macro(
 add_header_macro(
   complex
   ../libc/include/complex.yaml
-  complex.h.def
   complex.h
   DEPENDS
     .llvm_libc_common_h
@@ -208,7 +191,6 @@ add_header_macro(
 add_header_macro(
   setjmp
   ../libc/include/setjmp.yaml
-  setjmp.h.def
   setjmp.h
   DEPENDS
     .llvm_libc_common_h
@@ -218,7 +200,6 @@ add_header_macro(
 add_header_macro(
   string
   ../libc/include/string.yaml
-  string.h.def
   string.h
   DEPENDS
     .llvm_libc_common_h
@@ -229,7 +210,6 @@ add_header_macro(
 add_header_macro(
   strings
   ../libc/include/strings.yaml
-  strings.h.def
   strings.h
   DEPENDS
     .llvm_libc_common_h
@@ -239,7 +219,6 @@ add_header_macro(
 add_header_macro(
   search
   ../libc/include/search.yaml
-  search.h.def
   search.h
   DEPENDS
     .llvm_libc_common_h
@@ -253,7 +232,6 @@ add_header_macro(
 add_header_macro(
   time
   ../libc/include/time.yaml
-  time.h.def
   time.h
   DEPENDS
     .llvm_libc_common_h
@@ -269,7 +247,6 @@ add_header_macro(
 add_header_macro(
   threads
   ../libc/include/threads.yaml
-  threads.h.def
   threads.h
   DEPENDS
     .llvm_libc_common_h
@@ -286,7 +263,6 @@ add_header_macro(
 add_header_macro(
   errno
   ../libc/include/errno.yaml
-  errno.h.def
   errno.h
   DEPENDS
     .llvm-libc-macros.generic_error_number_macros
@@ -296,7 +272,6 @@ add_header_macro(
 add_header_macro(
   signal
   ../libc/include/signal.yaml
-  signal.h.def
   signal.h
   DEPENDS
     .llvm-libc-macros.signal_macros
@@ -312,7 +287,6 @@ add_header_macro(
 add_header_macro(
   stdbit
   ../libc/include/stdbit.yaml
-  stdbit.h.def
   stdbit.h
   DEPENDS
     .llvm_libc_common_h
@@ -322,7 +296,6 @@ add_header_macro(
 add_header_macro(
   stdckdint
   ../libc/include/stdckdint.yaml
-  stdckdint.h.def
   stdckdint.h
   DEPENDS
     .llvm_libc_common_h
@@ -332,7 +305,6 @@ add_header_macro(
 add_header_macro(
   stdio
   ../libc/include/stdio.yaml
-  stdio.h.def
   stdio.h
   DEPENDS
     .llvm-libc-macros.file_seek_macros
@@ -348,7 +320,6 @@ add_header_macro(
 add_header_macro(
   stdlib
   ../libc/include/stdlib.yaml
-  stdlib.h.def
   stdlib.h
   DEPENDS
     .llvm_libc_common_h
@@ -367,7 +338,6 @@ add_header_macro(
 add_header_macro(
   unistd
   ../libc/include/unistd.yaml
-  unistd.h.def
   unistd.h
   DEPENDS
     .llvm_libc_common_h
@@ -386,7 +356,6 @@ add_header_macro(
 add_header_macro(
   pthread
   ../libc/include/pthread.yaml
-  pthread.h.def
   pthread.h
   DEPENDS
     .llvm-libc-macros.pthread_macros
@@ -410,7 +379,6 @@ add_header_macro(
 add_header_macro(
   sched
   ../libc/include/sched.yaml
-  sched.h.def
   sched.h
   DEPENDS
     .llvm_libc_common_h
@@ -427,7 +395,6 @@ add_header_macro(
 add_header_macro(
   spawn
   ../libc/include/spawn.yaml
-  spawn.h.def
   spawn.h
   DEPENDS
     .llvm_libc_common_h
@@ -440,7 +407,6 @@ add_header_macro(
 add_header_macro(
   link
   ../libc/include/link.yaml
-  link.h.def
   link.h
   DEPENDS
     .llvm_libc_common_h
@@ -450,7 +416,6 @@ add_header_macro(
 add_header_macro(
   elf
   ../libc/include/elf.yaml
-  elf.h.def
   elf.h
   DEPENDS
     .llvm-libc-macros.elf_macros
@@ -464,7 +429,6 @@ file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/sys)
 add_header_macro(
   sys_auxv
   ../libc/include/sys/auxv.yaml
-  sys/auxv.h.def
   sys/auxv.h
   DEPENDS
     .llvm_libc_common_h
@@ -474,7 +438,6 @@ add_header_macro(
 add_header_macro(
   sys_epoll
   ../libc/include/sys/epoll.yaml
-  sys/epoll.h.def
   sys/epoll.h
   DEPENDS
     .llvm_libc_common_h
@@ -487,7 +450,6 @@ add_header_macro(
 add_header_macro(
   sys_ioctl
   ../libc/include/sys/ioctl.yaml
-  sys/ioctl.h.def
   sys/ioctl.h
   DEPENDS
     .llvm_libc_common_h
@@ -497,7 +459,6 @@ add_header_macro(
 add_header_macro(
   sys_mman
   ../libc/include/sys/mman.yaml
-  sys/mman.h.def
   sys/mman.h
   DEPENDS
     .llvm_libc_common_h
@@ -510,7 +471,6 @@ add_header_macro(
 add_header_macro(
   sys_prctl
   ../libc/include/sys/prctl.yaml
-  sys/prctl.h.def
   sys/prctl.h
   DEPENDS
     .llvm_libc_common_h
@@ -527,7 +487,6 @@ add_header(
 add_header_macro(
   sys_random
   ../libc/include/sys/random.yaml
-  sys/random.h.def
   sys/random.h
   DEPENDS
     .llvm_libc_common_h
@@ -539,7 +498,6 @@ add_header_macro(
 add_header_macro(
   sys_resource
   ../libc/include/sys/resource.yaml
-  sys/resource.h.def
   sys/resource.h
   DEPENDS
     .llvm_libc_common_h
@@ -551,7 +509,6 @@ add_header_macro(
 add_header_macro(
   sys_stat
   ../libc/include/sys/stat.yaml
-  sys/stat.h.def
   sys/stat.h
   DEPENDS
     .llvm_libc_common_h
@@ -573,7 +530,6 @@ add_header_macro(
 add_header_macro(
   sys_select
   ../libc/include/sys/select.yaml
-  sys/select.h.def
   sys/select.h
   DEPENDS
     .llvm_libc_common_h
@@ -589,7 +545,6 @@ add_header_macro(
 add_header_macro(
   sys_sendfile
   ../libc/include/sys/sendfile.yaml
-  sys/sendfile.h.def
   sys/sendfile.h
   DEPENDS
     .llvm_libc_common_h
@@ -601,7 +556,6 @@ add_header_macro(
 add_header_macro(
   sys_socket
   ../libc/include/sys/socket.yaml
-  sys/socket.h.def
   sys/socket.h
   DEPENDS
     .llvm_libc_common_h
@@ -617,7 +571,6 @@ add_header_macro(
 add_header_macro(
   sys_statvfs
   ../libc/include/sys/statvfs.yaml
-  sys/statvfs.h.def
   sys/statvfs.h
   DEPENDS
     .llvm_libc_common_h
@@ -627,7 +580,6 @@ add_header_macro(
 add_header_macro(
   sys_syscall
   ../libc/include/sys/syscall.yaml
-  sys/syscall.h.def
   sys/syscall.h
   DEPENDS
 )
@@ -635,7 +587,6 @@ add_header_macro(
 add_header_macro(
   sys_time
   ../libc/include/sys/time.yaml
-  sys/time.h.def
   sys/time.h
   DEPENDS
     .llvm_libc_common_h
@@ -646,7 +597,6 @@ add_header_macro(
 add_header_macro(
   sys_types
   ../libc/include/sys/types.yaml
-  sys/types.h.def
   sys/types.h
   DEPENDS
     .llvm_libc_common_h
@@ -676,7 +626,6 @@ add_header_macro(
 add_header_macro(
   sys_utsname
   ../libc/include/sys/utsname.yaml
-  sys/utsname.h.def
   sys/utsname.h
   DEPENDS
     .llvm_libc_common_h
@@ -686,7 +635,6 @@ add_header_macro(
 add_header_macro(
   sys_wait
   ../libc/include/sys/wait.yaml
-  sys/wait.h.def
   sys/wait.h
   DEPENDS
     .llvm_libc_common_h
@@ -699,7 +647,6 @@ add_header_macro(
 add_header_macro(
   termios
   ../libc/include/termios.yaml
-  termios.h.def
   termios.h
   DEPENDS
     .llvm_libc_common_h
@@ -714,7 +661,6 @@ add_header_macro(
 add_header_macro(
   uchar
   ../libc/include/uchar.yaml
-  uchar.h.def
   uchar.h
   DEPENDS
     .llvm_libc_common_h
@@ -727,7 +673,6 @@ add_header_macro(
 add_header_macro(
   wchar
   ../libc/include/wchar.yaml
-  wchar.h.def
   wchar.h
   DEPENDS
     .llvm_libc_common_h
@@ -741,7 +686,6 @@ add_header_macro(
 add_header_macro(
   locale
   ../libc/include/locale.yaml
-  locale.h.def
   locale.h
   DEPENDS
     .llvm_libc_common_h
diff --git a/libc/include/__llvm-libc-common.h b/libc/include/__llvm-libc-common.h
index d9d70aff771c..c63eb134a5e5 100644
--- a/libc/include/__llvm-libc-common.h
+++ b/libc/include/__llvm-libc-common.h
@@ -50,7 +50,14 @@
 #define __END_C_DECLS
 
 #undef __restrict
-#define __restrict restrict // C99 and above support the restrict keyword.
+#if __STDC_VERSION__ >= 199901L
+// C99 and above support the restrict keyword.
+#define __restrict restrict
+#elif !defined(__GNUC__)
+// GNU-compatible compilers accept the __ spelling in all modes.
+// Otherwise, omit the qualifier for pure C89 compatibility.
+#define __restrict
+#endif
 
 #undef _Noreturn
 #if __STDC_VERSION__ >= 201112L
diff --git a/libc/include/arpa/inet.yaml b/libc/include/arpa/inet.yaml
index cb366e0f5d69..10cd56d6ce78 100644
--- a/libc/include/arpa/inet.yaml
+++ b/libc/include/arpa/inet.yaml
@@ -1,4 +1,5 @@
-header: arpa-inet.h
+header: arpa/inet.h
+header_template: inet.h.def
 macros: []
 types: []
 enums: []
diff --git a/libc/include/assert.yaml b/libc/include/assert.yaml
index f740554488ed..1a3bdeda7e54 100644
--- a/libc/include/assert.yaml
+++ b/libc/include/assert.yaml
@@ -1,4 +1,5 @@
 header: assert.h
+header_template: assert.h.def
 macros: []
 types: []
 enums: []
diff --git a/libc/include/complex.yaml b/libc/include/complex.yaml
index cd81de7dd9e2..05318480a02f 100644
--- a/libc/include/complex.yaml
+++ b/libc/include/complex.yaml
@@ -1,4 +1,5 @@
 header: complex.h
+header_template: complex.h.def
 macros: []
 types:
   - type_name: cfloat16
diff --git a/libc/include/ctype.yaml b/libc/include/ctype.yaml
index b4823c3e5323..6238f1b88998 100644
--- a/libc/include/ctype.yaml
+++ b/libc/include/ctype.yaml
@@ -1,4 +1,5 @@
 header: ctype.h
+header_template: ctype.h.def
 macros: []
 types:
   - type_name: locale_t
diff --git a/libc/include/dirent.yaml b/libc/include/dirent.yaml
index cdccf6a0c7f2..3fc522fda80e 100644
--- a/libc/include/dirent.yaml
+++ b/libc/include/dirent.yaml
@@ -1,4 +1,5 @@
 header: dirent.h
+header_template: dirent.h.def
 macros: []
 types:
   - type_name: struct_dirent
diff --git a/libc/include/dlfcn.yaml b/libc/include/dlfcn.yaml
index 725ee705714a..9e8803cb5fa7 100644
--- a/libc/include/dlfcn.yaml
+++ b/libc/include/dlfcn.yaml
@@ -1,4 +1,5 @@
 header: dlfcn.h
+header_template: dlfcn.h.def
 macros:
   - macro_name: RTLD_LAZY
     macro_value: null
diff --git a/libc/include/elf.yaml b/libc/include/elf.yaml
index 2e9db329e229..f78ae82c7785 100644
--- a/libc/include/elf.yaml
+++ b/libc/include/elf.yaml
@@ -1,4 +1,5 @@
 header: elf.h
+header_template: elf.h.def
 standards:
   - Linux
 macros: []
diff --git a/libc/include/errno.yaml b/libc/include/errno.yaml
index a894063a1ee2..188a9fa1211a 100644
--- a/libc/include/errno.yaml
+++ b/libc/include/errno.yaml
@@ -1,4 +1,5 @@
 header: errno.h
+header_template: errno.h.def
 standards:
   - stdc
   - Linux
diff --git a/libc/include/fcntl.yaml b/libc/include/fcntl.yaml
index 71c0df3b0fad..78f93533b84d 100644
--- a/libc/include/fcntl.yaml
+++ b/libc/include/fcntl.yaml
@@ -1,4 +1,5 @@
 header: fcntl.h
+header_template: fcntl.h.def
 macros: []
 types:
   - type_name: off_t
diff --git a/libc/include/features.yaml b/libc/include/features.yaml
index a18af22edb74..726320a40881 100644
--- a/libc/include/features.yaml
+++ b/libc/include/features.yaml
@@ -1,4 +1,5 @@
 header: features.h
+header_template: features.h.def
 standards:
   - stdc
 macros: []
diff --git a/libc/include/fenv.yaml b/libc/include/fenv.yaml
index 1010efc6402c..1ecaf6308550 100644
--- a/libc/include/fenv.yaml
+++ b/libc/include/fenv.yaml
@@ -1,4 +1,5 @@
 header: fenv.h
+header_template: fenv.h.def
 macros: []
 types:
   - type_name: fenv_t
diff --git a/libc/include/float.yaml b/libc/include/float.yaml
index 63639a6e8ed1..21df6513e77e 100644
--- a/libc/include/float.yaml
+++ b/libc/include/float.yaml
@@ -1,4 +1,5 @@
 header: float.h
+header_template: float.h.def
 standards:
   - stdc
 macros: []
diff --git a/libc/include/inttypes.yaml b/libc/include/inttypes.yaml
index ad636cc5121a..d5dec5b465ba 100644
--- a/libc/include/inttypes.yaml
+++ b/libc/include/inttypes.yaml
@@ -1,4 +1,5 @@
 header: inttypes.h
+header_template: inttypes.h.def
 macros: []
 types:
   - type_name: imaxdiv_t
diff --git a/libc/include/limits.yaml b/libc/include/limits.yaml
index bf33ed24e7a8..b664041bb56c 100644
--- a/libc/include/limits.yaml
+++ b/libc/include/limits.yaml
@@ -1,4 +1,5 @@
 header: limits.h
+header_template: limits.h.def
 standards:
   - stdc
 macros: []
diff --git a/libc/include/link.yaml b/libc/include/link.yaml
index d1963a86813a..1cd609e292b5 100644
--- a/libc/include/link.yaml
+++ b/libc/include/link.yaml
@@ -1,4 +1,5 @@
 header: link.h
+header_template: link.h.def
 standards:
   - Linux
 macros: []
diff --git a/libc/include/locale.yaml b/libc/include/locale.yaml
index 7da7966ea730..9ff53c16398a 100644
--- a/libc/include/locale.yaml
+++ b/libc/include/locale.yaml
@@ -1,4 +1,5 @@
 header: locale.h
+header_template: locale.h.def
 functions:
   - name: localeconv
     standards:
diff --git a/libc/include/malloc.yaml b/libc/include/malloc.yaml
index 8db4f3aebb9b..ec73c9090f72 100644
--- a/libc/include/malloc.yaml
+++ b/libc/include/malloc.yaml
@@ -1,4 +1,5 @@
 header: malloc.h
+header_template: malloc.h.def
 macros: []
 types: []
 enums: []
diff --git a/libc/include/math.yaml b/libc/include/math.yaml
index 3b8caec66bbf..831d04574567 100644
--- a/libc/include/math.yaml
+++ b/libc/include/math.yaml
@@ -1,4 +1,5 @@
 header: math.h
+header_template: math.h.def
 macros: []
 types:
   - type_name: float_t
diff --git a/libc/include/pthread.yaml b/libc/include/pthread.yaml
index b9068c3f1765..4f386bdd11cf 100644
--- a/libc/include/pthread.yaml
+++ b/libc/include/pthread.yaml
@@ -1,4 +1,5 @@
 header: pthread.h
+header_template: pthread.h.def
 macros: []
 types:
   - type_name: pthread_t
diff --git a/libc/include/sched.yaml b/libc/include/sched.yaml
index 2d4876b722ab..57871f524bf1 100644
--- a/libc/include/sched.yaml
+++ b/libc/include/sched.yaml
@@ -1,4 +1,5 @@
 header: sched.h
+header_template: sched.h.def
 macros: []
 types:
   - type_name: struct_timespec
diff --git a/libc/include/search.yaml b/libc/include/search.yaml
index a0c73bc679d8..b7ce06d48e70 100644
--- a/libc/include/search.yaml
+++ b/libc/include/search.yaml
@@ -1,4 +1,5 @@
 header: search.h
+header_template: search.h.def
 macros: []
 types:
   - type_name: struct_hsearch_data
diff --git a/libc/include/setjmp.yaml b/libc/include/setjmp.yaml
index 68e3ff046e4b..2c4f7fb6dfcc 100644
--- a/libc/include/setjmp.yaml
+++ b/libc/include/setjmp.yaml
@@ -1,4 +1,5 @@
 header: setjmp.h
+header_template: setjmp.h.def
 macros: []
 types:
   - type_name: jmp_buf
diff --git a/libc/include/signal.yaml b/libc/include/signal.yaml
index c66abb1a8744..576e77576ac7 100644
--- a/libc/include/signal.yaml
+++ b/libc/include/signal.yaml
@@ -1,4 +1,5 @@
 header: signal.h
+header_template: signal.h.def
 macros: []
 types:
   - type_name: pid_t
diff --git a/libc/include/spawn.yaml b/libc/include/spawn.yaml
index be3f4e99d27f..e725ab9719ed 100644
--- a/libc/include/spawn.yaml
+++ b/libc/include/spawn.yaml
@@ -1,4 +1,5 @@
 header: spawn.h
+header_template: spawn.h.def
 macros: []
 types:
   - type_name: posix_spawn_file_actions_t
diff --git a/libc/include/stdbit.yaml b/libc/include/stdbit.yaml
index 25d2d326c30e..e9bd6b3918e7 100644
--- a/libc/include/stdbit.yaml
+++ b/libc/include/stdbit.yaml
@@ -1,4 +1,5 @@
 header: stdbit.h
+header_template: stdbit.h.def
 macros: []
 types: []
 enums: []
diff --git a/libc/include/stdckdint.yaml b/libc/include/stdckdint.yaml
index ea8fc47625b0..e8b2e80ee029 100644
--- a/libc/include/stdckdint.yaml
+++ b/libc/include/stdckdint.yaml
@@ -1,4 +1,5 @@
 header: stdckdint.h
+header_template: stdckdint.h.def
 standards:
   - stdc
 macros: []
diff --git a/libc/include/stdfix.yaml b/libc/include/stdfix.yaml
index 9787eaba45e4..7b3bdba082dd 100644
--- a/libc/include/stdfix.yaml
+++ b/libc/include/stdfix.yaml
@@ -1,4 +1,5 @@
 header: stdfix.h
+header_template: stdfix.h.def
 macros: []
 types: 
   - type_name: stdfix-types
diff --git a/libc/include/stdint.yaml b/libc/include/stdint.yaml
index 8887f596bc8a..d583a104af37 100644
--- a/libc/include/stdint.yaml
+++ b/libc/include/stdint.yaml
@@ -1,4 +1,5 @@
 header: stdint.h
+header_template: stdint.h.def
 standards:
   - stdc
 macros: []
diff --git a/libc/include/stdio.yaml b/libc/include/stdio.yaml
index fd116bbc0089..2619984cca26 100644
--- a/libc/include/stdio.yaml
+++ b/libc/include/stdio.yaml
@@ -1,4 +1,5 @@
 header: stdio.h
+header_template: stdio.h.def
 macros:
   - macro_name: stdout
     macro_value: stdout
diff --git a/libc/include/stdlib.yaml b/libc/include/stdlib.yaml
index c6c95e421cee..4b68f272613b 100644
--- a/libc/include/stdlib.yaml
+++ b/libc/include/stdlib.yaml
@@ -1,4 +1,5 @@
 header: stdlib.h
+header_template: stdlib.h.def
 standards:
   - stdc
 macros: []
diff --git a/libc/include/string.yaml b/libc/include/string.yaml
index af1750e91243..deded309abc2 100644
--- a/libc/include/string.yaml
+++ b/libc/include/string.yaml
@@ -1,4 +1,5 @@
 header: string.h
+header_template: string.h.def
 macros: []
 types:
   - type_name: size_t
diff --git a/libc/include/strings.yaml b/libc/include/strings.yaml
index ca91b626740c..e672dca6a94d 100644
--- a/libc/include/strings.yaml
+++ b/libc/include/strings.yaml
@@ -1,4 +1,5 @@
 header: strings.h
+header_template: strings.h.def
 macros: []
 types: []
 enums: []
diff --git a/libc/include/sys/auxv.yaml b/libc/include/sys/auxv.yaml
index 9d546b358824..82ecee75c40a 100644
--- a/libc/include/sys/auxv.yaml
+++ b/libc/include/sys/auxv.yaml
@@ -1,4 +1,5 @@
-header: sys-auxv.h
+header: sys/auxv.h
+header_template: auxv.h.def
 macros: []
 types: []
 enums: []
diff --git a/libc/include/sys/epoll.yaml b/libc/include/sys/epoll.yaml
index ee188c17fedc..996eb785bc71 100644
--- a/libc/include/sys/epoll.yaml
+++ b/libc/include/sys/epoll.yaml
@@ -1,4 +1,5 @@
-header: sys-epoll.h
+header: sys/epoll.h
+header_template: epoll.h.def
 macros: []
 types:
   - type_name: struct_epoll_event
diff --git a/libc/include/sys/ioctl.yaml b/libc/include/sys/ioctl.yaml
index ffe73a84d51b..5f7b7f333191 100644
--- a/libc/include/sys/ioctl.yaml
+++ b/libc/include/sys/ioctl.yaml
@@ -1,4 +1,5 @@
-header: sys-ioctl.h
+header: sys/ioctl.h
+header_template: ioctl.h.def
 standards: POSIX
 macros: []
 types: []
diff --git a/libc/include/sys/mman.yaml b/libc/include/sys/mman.yaml
index 962ca3591917..8c207552f980 100644
--- a/libc/include/sys/mman.yaml
+++ b/libc/include/sys/mman.yaml
@@ -1,4 +1,5 @@
-header: sys-mman.h
+header: sys/mman.h
+header_template: mman.h.def
 macros: []
 types:
   - type_name: mode_t
diff --git a/libc/include/sys/prctl.yaml b/libc/include/sys/prctl.yaml
index 82374be87d5d..53f57645b18e 100644
--- a/libc/include/sys/prctl.yaml
+++ b/libc/include/sys/prctl.yaml
@@ -1,4 +1,5 @@
-header: sys-prctl.h
+header: sys/prctl.h
+header_template: prctl.h.def
 macros: []
 types: []
 enums: []
diff --git a/libc/include/sys/random.yaml b/libc/include/sys/random.yaml
index 228bb50d5db9..4efb2fbb4473 100644
--- a/libc/include/sys/random.yaml
+++ b/libc/include/sys/random.yaml
@@ -1,4 +1,5 @@
-header: sys-random.h
+header: sys/random.h
+header_template: random.h.def
 macros: []
 types:
   - type_name: ssize_t
diff --git a/libc/include/sys/resource.yaml b/libc/include/sys/resource.yaml
index 85ea1ad12f19..3652d6d490a4 100644
--- a/libc/include/sys/resource.yaml
+++ b/libc/include/sys/resource.yaml
@@ -1,4 +1,5 @@
-header: sys-resource.h
+header: sys/resource.h
+header_template: resource.h.def
 macros: []
 types:
   - type_name: struct_rlimit
diff --git a/libc/include/sys/select.yaml b/libc/include/sys/select.yaml
index c6806122aa81..6066fd341f07 100644
--- a/libc/include/sys/select.yaml
+++ b/libc/include/sys/select.yaml
@@ -1,4 +1,5 @@
-header: sys-select.h
+header: sys/select.h
+header_template: select.h.def
 macros: []
 types:
   - type_name: struct_timeval
diff --git a/libc/include/sys/sendfile.yaml b/libc/include/sys/sendfile.yaml
index 7e45e40e171d..259ab83dff54 100644
--- a/libc/include/sys/sendfile.yaml
+++ b/libc/include/sys/sendfile.yaml
@@ -1,4 +1,5 @@
-header: sys-sendfile.h
+header: sys/sendfile.h
+header_template: sendfile.h.def
 macros: []
 types:
   - type_name: ssize_t
diff --git a/libc/include/sys/socket.yaml b/libc/include/sys/socket.yaml
index 47d835fa5f4a..00d5de6af8a8 100644
--- a/libc/include/sys/socket.yaml
+++ b/libc/include/sys/socket.yaml
@@ -1,4 +1,5 @@
-header: sys-socket.h
+header: sys/socket.h
+header_template: socket.h.def
 macros: []
 types:
   - type_name: struct_sockaddr_un
diff --git a/libc/include/sys/stat.yaml b/libc/include/sys/stat.yaml
index ed500f832f90..7f013420818a 100644
--- a/libc/include/sys/stat.yaml
+++ b/libc/include/sys/stat.yaml
@@ -1,4 +1,5 @@
-header: sys-stat.h
+header: sys/stat.h
+header_template: stat.h.def
 macros: []
 types:
   - type_name: blkcnt_t
diff --git a/libc/include/sys/statvfs.yaml b/libc/include/sys/statvfs.yaml
index 22e0ef22c413..8c1d254add37 100644
--- a/libc/include/sys/statvfs.yaml
+++ b/libc/include/sys/statvfs.yaml
@@ -1,4 +1,5 @@
-header: sys-statvfs.h
+header: sys/statvfs.h
+header_template: statvfs.h.def
 macros: []
 types:
   - type_name: struct_statvfs
diff --git a/libc/include/sys/syscall.yaml b/libc/include/sys/syscall.yaml
index c0a64338b6f7..879d95c2ea39 100644
--- a/libc/include/sys/syscall.yaml
+++ b/libc/include/sys/syscall.yaml
@@ -1,4 +1,5 @@
-header: sys-syscall.h
+header: sys/syscall.h
+header_template: syscall.h.def
 standards: Linux
 macros: []
 types: []
diff --git a/libc/include/sys/time.yaml b/libc/include/sys/time.yaml
index eb3dd548389b..687c1f83028d 100644
--- a/libc/include/sys/time.yaml
+++ b/libc/include/sys/time.yaml
@@ -1,4 +1,5 @@
-header: sys-time.h
+header: sys/time.h
+header_template: time.h.def
 standards: Linux
 macros: []
 types: []
diff --git a/libc/include/sys/types.yaml b/libc/include/sys/types.yaml
index 15eaf107f691..6fa0b448fcd3 100644
--- a/libc/include/sys/types.yaml
+++ b/libc/include/sys/types.yaml
@@ -1,4 +1,5 @@
-header: sys-types.h
+header: sys/types.h
+header_template: types.h.def
 standards: POSIX
 macros: []
 types:
diff --git a/libc/include/sys/utsname.yaml b/libc/include/sys/utsname.yaml
index eecd55b18082..6c7cb71f9a34 100644
--- a/libc/include/sys/utsname.yaml
+++ b/libc/include/sys/utsname.yaml
@@ -1,4 +1,5 @@
-header: sys-utsname.h
+header: sys/utsname.h
+header_template: utsname.h.def
 macros: []
 types:
   - type_name: struct_utsname
diff --git a/libc/include/sys/wait.yaml b/libc/include/sys/wait.yaml
index 4f0c69baee2c..6257e34b9e08 100644
--- a/libc/include/sys/wait.yaml
+++ b/libc/include/sys/wait.yaml
@@ -1,4 +1,5 @@
-header: sys-wait.h
+header: sys/wait.h
+header_template: wait.h.def
 macros: []
 types:
   - type_name: siginfo_t
diff --git a/libc/include/termios.yaml b/libc/include/termios.yaml
index e9c4cd375d0b..8815097264f9 100644
--- a/libc/include/termios.yaml
+++ b/libc/include/termios.yaml
@@ -1,4 +1,5 @@
 header: termios.h
+header_template: termios.h.def
 macros: []
 types:
   - type_name: tcflag_t
diff --git a/libc/include/threads.yaml b/libc/include/threads.yaml
index aadcaf5f66e0..7014822f9251 100644
--- a/libc/include/threads.yaml
+++ b/libc/include/threads.yaml
@@ -1,4 +1,5 @@
 header: threads.h
+header_template: threads.h.def
 macros:
   - macro_name: ONCE_FLAG_INIT
     macro_value: '{0}'
diff --git a/libc/include/time.yaml b/libc/include/time.yaml
index 3f745e5ee338..b71b9ab72075 100644
--- a/libc/include/time.yaml
+++ b/libc/include/time.yaml
@@ -1,4 +1,5 @@
 header: time.h
+header_template: time.h.def
 macros: []
 types:
   - type_name: struct_timeval
diff --git a/libc/include/uchar.yaml b/libc/include/uchar.yaml
index 18ca840612e0..713919796762 100644
--- a/libc/include/uchar.yaml
+++ b/libc/include/uchar.yaml
@@ -1,4 +1,5 @@
 header: uchar.h
+header_template: uchar.h.def
 standards:
   - stdc
 macros: []
diff --git a/libc/include/unistd.yaml b/libc/include/unistd.yaml
index c6441c04ce3a..fada365e0103 100644
--- a/libc/include/unistd.yaml
+++ b/libc/include/unistd.yaml
@@ -1,4 +1,5 @@
 header: unistd.h
+header_template: unistd.h.def
 macros: []
 types:
   - type_name: uid_t
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index bc824b21d8be..27a5926b5745 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -1,4 +1,5 @@
 header: wchar.h
+header_template: wchar.h.def
 macros: []
 types:
   - type_name: size_t
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 4e90aad9a45b..5090dc218cda 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -267,7 +267,9 @@ add_header_library(
   HDRS
     fixedvector.h
   DEPENDS
+    .libc_assert
     libc.src.__support.CPP.array
+    libc.src.string.memory_utils.inline_memset
 )
 
 add_header_library(
diff --git a/libc/src/__support/File/file.cpp b/libc/src/__support/File/file.cpp
index 972249fef96b..528542cccf32 100644
--- a/libc/src/__support/File/file.cpp
+++ b/libc/src/__support/File/file.cpp
@@ -42,7 +42,7 @@ FileIOResult File::write_unlocked_nbf(const uint8_t *data, size_t len) {
   if (pos > 0) { // If the buffer is not empty
     // Flush the buffer
     const size_t write_size = pos;
-    auto write_result = platform_write(this, buf, write_size);
+    FileIOResult write_result = platform_write(this, buf, write_size);
     pos = 0; // Buffer is now empty so reset pos to the beginning.
     // If less bytes were written than expected, then an error occurred.
     if (write_result < write_size) {
@@ -52,7 +52,7 @@ FileIOResult File::write_unlocked_nbf(const uint8_t *data, size_t len) {
     }
   }
 
-  auto write_result = platform_write(this, data, len);
+  FileIOResult write_result = platform_write(this, data, len);
   if (write_result < len)
     err = true;
   return write_result;
@@ -99,7 +99,7 @@ FileIOResult File::write_unlocked_fbf(const uint8_t *data, size_t len) {
   // is full.
   const size_t write_size = pos;
 
-  auto buf_result = platform_write(this, buf, write_size);
+  FileIOResult buf_result = platform_write(this, buf, write_size);
   size_t bytes_written = buf_result.value;
 
   pos = 0; // Buffer is now empty so reset pos to the beginning.
@@ -121,7 +121,8 @@ FileIOResult File::write_unlocked_fbf(const uint8_t *data, size_t len) {
     pos = remainder.size();
   } else {
 
-    auto result = platform_write(this, remainder.data(), remainder.size());
+    FileIOResult result =
+        platform_write(this, remainder.data(), remainder.size());
     size_t bytes_written = buf_result.value;
 
     // If less bytes were written than expected, then an error occurred. Return
@@ -190,6 +191,17 @@ FileIOResult File::read_unlocked(void *data, size_t len) {
 
   prev_op = FileOp::READ;
 
+  if (bufmode == _IONBF) { // unbuffered.
+    return read_unlocked_nbf(static_cast<uint8_t *>(data), len);
+  } else if (bufmode == _IOFBF) { // fully buffered
+    return read_unlocked_fbf(static_cast<uint8_t *>(data), len);
+  } else /*if (bufmode == _IOLBF) */ { // line buffered
+    // There is no line buffered mode for read. Use fully buffered instead.
+    return read_unlocked_fbf(static_cast<uint8_t *>(data), len);
+  }
+}
+
+size_t File::copy_data_from_buf(uint8_t *data, size_t len) {
   cpp::span<uint8_t> bufref(static_cast<uint8_t *>(buf), bufsize);
   cpp::span<uint8_t> dataref(static_cast<uint8_t *>(data), len);
 
@@ -209,32 +221,42 @@ FileIOResult File::read_unlocked(void *data, size_t len) {
   for (size_t i = 0; i < available_data; ++i)
     dataref[i] = bufref[i + pos];
   read_limit = pos = 0; // Reset the pointers.
+
+  return available_data;
+}
+
+FileIOResult File::read_unlocked_fbf(uint8_t *data, size_t len) {
+  // Read data from the buffer first.
+  size_t available_data = copy_data_from_buf(data, len);
+  if (available_data == len)
+    return available_data;
+
   // Update the dataref to reflect that fact that we have already
   // copied |available_data| into |data|.
-  dataref = cpp::span<uint8_t>(dataref.data() + available_data,
-                               dataref.size() - available_data);
-
   size_t to_fetch = len - available_data;
+  cpp::span<uint8_t> dataref(static_cast<uint8_t *>(data) + available_data,
+                             to_fetch);
+
   if (to_fetch > bufsize) {
-    auto result = platform_read(this, dataref.data(), to_fetch);
+    FileIOResult result = platform_read(this, dataref.data(), to_fetch);
     size_t fetched_size = result.value;
     if (result.has_error() || fetched_size < to_fetch) {
       if (!result.has_error())
         eof = true;
       else
         err = true;
-      return {available_data + fetched_size, result.has_error()};
+      return {available_data + fetched_size, result.error};
     }
     return len;
   }
 
   // Fetch and buffer another buffer worth of data.
-  auto result = platform_read(this, buf, bufsize);
+  FileIOResult result = platform_read(this, buf, bufsize);
   size_t fetched_size = result.value;
   read_limit += fetched_size;
   size_t transfer_size = fetched_size >= to_fetch ? to_fetch : fetched_size;
   for (size_t i = 0; i < transfer_size; ++i)
-    dataref[i] = bufref[i];
+    dataref[i] = buf[i];
   pos += transfer_size;
   if (result.has_error() || fetched_size < to_fetch) {
     if (!result.has_error())
@@ -245,6 +267,26 @@ FileIOResult File::read_unlocked(void *data, size_t len) {
   return {transfer_size + available_data, result.error};
 }
 
+FileIOResult File::read_unlocked_nbf(uint8_t *data, size_t len) {
+  // Check whether there is a character in the ungetc buffer.
+  size_t available_data = copy_data_from_buf(data, len);
+  if (available_data == len)
+    return available_data;
+
+  // Directly copy the data into |data|.
+  cpp::span<uint8_t> dataref(static_cast<uint8_t *>(data) + available_data,
+                             len - available_data);
+  FileIOResult result = platform_read(this, dataref.data(), dataref.size());
+
+  if (result.has_error() || result < dataref.size()) {
+    if (!result.has_error())
+      eof = true;
+    else
+      err = true;
+  }
+  return {result + available_data, result.error};
+}
+
 int File::ungetc_unlocked(int c) {
   // There is no meaning to unget if:
   // 1. You are trying to push back EOF.
@@ -287,7 +329,7 @@ ErrorOr<int> File::seek(off_t offset, int whence) {
   FileLock lock(this);
   if (prev_op == FileOp::WRITE && pos > 0) {
 
-    auto buf_result = platform_write(this, buf, pos);
+    FileIOResult buf_result = platform_write(this, buf, pos);
     if (buf_result.has_error() || buf_result.value < pos) {
       err = true;
       return Error(buf_result.error);
@@ -325,7 +367,7 @@ ErrorOr<off_t> File::tell() {
 
 int File::flush_unlocked() {
   if (prev_op == FileOp::WRITE && pos > 0) {
-    auto buf_result = platform_write(this, buf, pos);
+    FileIOResult buf_result = platform_write(this, buf, pos);
     if (buf_result.has_error() || buf_result.value < pos) {
       err = true;
       return buf_result.error;
diff --git a/libc/src/__support/File/file.h b/libc/src/__support/File/file.h
index 42e1d11b4ab1..5c97a9c6419f 100644
--- a/libc/src/__support/File/file.h
+++ b/libc/src/__support/File/file.h
@@ -280,6 +280,10 @@ private:
   FileIOResult write_unlocked_fbf(const uint8_t *data, size_t len);
   FileIOResult write_unlocked_nbf(const uint8_t *data, size_t len);
 
+  FileIOResult read_unlocked_fbf(uint8_t *data, size_t len);
+  FileIOResult read_unlocked_nbf(uint8_t *data, size_t len);
+  size_t copy_data_from_buf(uint8_t *data, size_t len);
+
   constexpr void adjust_buf() {
     if (read_allowed() && (buf == nullptr || bufsize == 0)) {
       // We should allow atleast one ungetc operation.
diff --git a/libc/src/__support/GPU/CMakeLists.txt b/libc/src/__support/GPU/CMakeLists.txt
index 28fd9a1ebcc9..9b359f65cdb3 100644
--- a/libc/src/__support/GPU/CMakeLists.txt
+++ b/libc/src/__support/GPU/CMakeLists.txt
@@ -1,16 +1,12 @@
-if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
+# These utilities are GPU only.
+if(NOT LIBC_TARGET_OS_IS_GPU)
   return()
 endif()
 
-add_subdirectory(${LIBC_TARGET_ARCHITECTURE})
-set(target_gpu_utils libc.src.__support.GPU.${LIBC_TARGET_ARCHITECTURE}.${LIBC_TARGET_ARCHITECTURE}_utils)
-
 add_header_library(
   utils
   HDRS
     utils.h
-  DEPENDS
-    ${target_gpu_utils}
 )
 
 add_object_library(
@@ -21,6 +17,6 @@ add_object_library(
     allocator.h
   DEPENDS
     libc.src.__support.common
-    libc.src.__support.GPU.utils
     libc.src.__support.RPC.rpc_client
+    .utils
 )
diff --git a/libc/src/__support/GPU/amdgpu/CMakeLists.txt b/libc/src/__support/GPU/amdgpu/CMakeLists.txt
deleted file mode 100644
index f2b98fc03b21..000000000000
--- a/libc/src/__support/GPU/amdgpu/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-add_header_library(
-  amdgpu_utils
-  HDRS
-    utils.h
-  DEPENDS
-    libc.src.__support.common
-)
diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
deleted file mode 100644
index 6ab95403ca38..000000000000
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ /dev/null
@@ -1,183 +0,0 @@
-//===-------------- AMDGPU implementation of GPU utils ----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_AMDGPU_IO_H
-#define LLVM_LIBC_SRC___SUPPORT_GPU_AMDGPU_IO_H
-
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-
-#include <stdint.h>
-
-namespace LIBC_NAMESPACE_DECL {
-namespace gpu {
-
-/// Type aliases to the address spaces used by the AMDGPU backend.
-template <typename T> using Private = [[clang::opencl_private]] T;
-template <typename T> using Constant = [[clang::opencl_constant]] T;
-template <typename T> using Local = [[clang::opencl_local]] T;
-template <typename T> using Global = [[clang::opencl_global]] T;
-
-/// Returns the number of workgroups in the 'x' dimension of the grid.
-LIBC_INLINE uint32_t get_num_blocks_x() {
-  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
-}
-
-/// Returns the number of workgroups in the 'y' dimension of the grid.
-LIBC_INLINE uint32_t get_num_blocks_y() {
-  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
-}
-
-/// Returns the number of workgroups in the 'z' dimension of the grid.
-LIBC_INLINE uint32_t get_num_blocks_z() {
-  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
-}
-
-/// Returns the total number of workgruops in the grid.
-LIBC_INLINE uint64_t get_num_blocks() {
-  return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
-}
-
-/// Returns the 'x' dimension of the current AMD workgroup's id.
-LIBC_INLINE uint32_t get_block_id_x() {
-  return __builtin_amdgcn_workgroup_id_x();
-}
-
-/// Returns the 'y' dimension of the current AMD workgroup's id.
-LIBC_INLINE uint32_t get_block_id_y() {
-  return __builtin_amdgcn_workgroup_id_y();
-}
-
-/// Returns the 'z' dimension of the current AMD workgroup's id.
-LIBC_INLINE uint32_t get_block_id_z() {
-  return __builtin_amdgcn_workgroup_id_z();
-}
-
-/// Returns the absolute id of the AMD workgroup.
-LIBC_INLINE uint64_t get_block_id() {
-  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
-         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
-}
-
-/// Returns the number of workitems in the 'x' dimension.
-LIBC_INLINE uint32_t get_num_threads_x() {
-  return __builtin_amdgcn_workgroup_size_x();
-}
-
-/// Returns the number of workitems in the 'y' dimension.
-LIBC_INLINE uint32_t get_num_threads_y() {
-  return __builtin_amdgcn_workgroup_size_y();
-}
-
-/// Returns the number of workitems in the 'z' dimension.
-LIBC_INLINE uint32_t get_num_threads_z() {
-  return __builtin_amdgcn_workgroup_size_z();
-}
-
-/// Returns the total number of workitems in the workgroup.
-LIBC_INLINE uint64_t get_num_threads() {
-  return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
-}
-
-/// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
-LIBC_INLINE uint32_t get_thread_id_x() {
-  return __builtin_amdgcn_workitem_id_x();
-}
-
-/// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
-LIBC_INLINE uint32_t get_thread_id_y() {
-  return __builtin_amdgcn_workitem_id_y();
-}
-
-/// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
-LIBC_INLINE uint32_t get_thread_id_z() {
-  return __builtin_amdgcn_workitem_id_z();
-}
-
-/// Returns the absolute id of the thread in the current AMD workgroup.
-LIBC_INLINE uint64_t get_thread_id() {
-  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
-         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
-}
-
-/// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
-/// and compilation options.
-LIBC_INLINE uint32_t get_lane_size() {
-  return __builtin_amdgcn_wavefrontsize();
-}
-
-/// Returns the id of the thread inside of an AMD wavefront executing together.
-[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
-  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
-}
-
-/// Returns the bit-mask of active threads in the current wavefront.
-[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
-  return __builtin_amdgcn_read_exec();
-}
-
-/// Copies the value from the first active thread in the wavefront to the rest.
-[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t,
-                                                           uint32_t x) {
-  return __builtin_amdgcn_readfirstlane(x);
-}
-
-/// Returns a bitmask of threads in the current lane for which \p x is true.
-[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
-  // the lane_mask & gives the nvptx semantics when lane_mask is a subset of
-  // the active threads
-  return lane_mask & __builtin_amdgcn_ballot_w64(x);
-}
-
-/// Waits for all the threads in the block to converge and issues a fence.
-[[clang::convergent]] LIBC_INLINE void sync_threads() {
-  __builtin_amdgcn_s_barrier();
-  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
-}
-
-/// Waits for all pending memory operations to complete in program order.
-[[clang::convergent]] LIBC_INLINE void memory_fence() {
-  __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "");
-}
-
-/// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
-[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) {
-  __builtin_amdgcn_wave_barrier();
-}
-
-/// Shuffles the the lanes inside the wavefront according to the given index.
-[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t idx,
-                                                   uint32_t x) {
-  return __builtin_amdgcn_ds_bpermute(idx << 2, x);
-}
-
-/// Returns the current value of the GPU's processor clock.
-/// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter.
-LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
-
-/// Returns a fixed-frequency timestamp. The actual frequency is dependent on
-/// the card and can only be queried via the driver.
-LIBC_INLINE uint64_t fixed_frequency_clock() {
-  return __builtin_readsteadycounter();
-}
-
-/// Terminates execution of the associated wavefront.
-[[noreturn]] LIBC_INLINE void end_program() { __builtin_amdgcn_endpgm(); }
-
-/// Returns a unique identifier for the process cluster the current wavefront is
-/// executing on. Here we use the identifier for the compute unit (CU) and
-/// shader engine.
-/// FIXME: Currently unimplemented on AMDGPU until we have a simpler interface
-/// than the one at
-/// https://github.com/ROCm/clr/blob/develop/hipamd/include/hip/amd_detail/amd_device_functions.h#L899
-LIBC_INLINE uint32_t get_cluster_id() { return 0; }
-
-} // namespace gpu
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif
diff --git a/libc/src/__support/GPU/generic/CMakeLists.txt b/libc/src/__support/GPU/generic/CMakeLists.txt
deleted file mode 100644
index 68ba7d1ec80e..000000000000
--- a/libc/src/__support/GPU/generic/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-add_header_library(
-  generic_utils
-  HDRS
-    utils.h
-  DEPENDS
-    libc.src.__support.common
-)
diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h
deleted file mode 100644
index 9461ef0aa245..000000000000
--- a/libc/src/__support/GPU/generic/utils.h
+++ /dev/null
@@ -1,84 +0,0 @@
-//===-------------- Generic implementation of GPU utils ---------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H
-#define LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H
-
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-
-#include <stdint.h>
-
-namespace LIBC_NAMESPACE_DECL {
-namespace gpu {
-
-template <typename T> using Private = T;
-template <typename T> using Constant = T;
-template <typename T> using Shared = T;
-template <typename T> using Global = T;
-
-LIBC_INLINE uint32_t get_num_blocks_x() { return 1; }
-
-LIBC_INLINE uint32_t get_num_blocks_y() { return 1; }
-
-LIBC_INLINE uint32_t get_num_blocks_z() { return 1; }
-
-LIBC_INLINE uint64_t get_num_blocks() { return 1; }
-
-LIBC_INLINE uint32_t get_block_id_x() { return 0; }
-
-LIBC_INLINE uint32_t get_block_id_y() { return 0; }
-
-LIBC_INLINE uint32_t get_block_id_z() { return 0; }
-
-LIBC_INLINE uint64_t get_block_id() { return 0; }
-
-LIBC_INLINE uint32_t get_num_threads_x() { return 1; }
-
-LIBC_INLINE uint32_t get_num_threads_y() { return 1; }
-
-LIBC_INLINE uint32_t get_num_threads_z() { return 1; }
-
-LIBC_INLINE uint64_t get_num_threads() { return 1; }
-
-LIBC_INLINE uint32_t get_thread_id_x() { return 0; }
-
-LIBC_INLINE uint32_t get_thread_id_y() { return 0; }
-
-LIBC_INLINE uint32_t get_thread_id_z() { return 0; }
-
-LIBC_INLINE uint64_t get_thread_id() { return 0; }
-
-LIBC_INLINE uint32_t get_lane_size() { return 1; }
-
-LIBC_INLINE uint32_t get_lane_id() { return 0; }
-
-LIBC_INLINE uint64_t get_lane_mask() { return 1; }
-
-LIBC_INLINE uint32_t broadcast_value(uint64_t, uint32_t x) { return x; }
-
-LIBC_INLINE uint64_t ballot(uint64_t, bool x) { return x; }
-
-LIBC_INLINE void sync_threads() {}
-
-LIBC_INLINE void sync_lane(uint64_t) {}
-
-LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t, uint32_t x) { return x; }
-
-LIBC_INLINE uint64_t processor_clock() { return 0; }
-
-LIBC_INLINE uint64_t fixed_frequency_clock() { return 0; }
-
-[[noreturn]] LIBC_INLINE void end_program() { __builtin_unreachable(); }
-
-LIBC_INLINE uint32_t get_cluster_id() { return 0; }
-
-} // namespace gpu
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif // LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H
diff --git a/libc/src/__support/GPU/nvptx/CMakeLists.txt b/libc/src/__support/GPU/nvptx/CMakeLists.txt
deleted file mode 100644
index 0d3f8c7933c8..000000000000
--- a/libc/src/__support/GPU/nvptx/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-add_header_library(
-  nvptx_utils
-  HDRS
-    utils.h
-  DEPENDS
-    libc.src.__support.common
-)
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
deleted file mode 100644
index 1a43a839a9ce..000000000000
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ /dev/null
@@ -1,160 +0,0 @@
-//===-------------- NVPTX implementation of GPU utils -----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-id: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H
-#define LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H
-
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-
-#include <stdint.h>
-
-namespace LIBC_NAMESPACE_DECL {
-namespace gpu {
-
-/// Type aliases to the address spaces used by the NVPTX backend.
-template <typename T> using Private = [[clang::opencl_private]] T;
-template <typename T> using Constant = [[clang::opencl_constant]] T;
-template <typename T> using Local = [[clang::opencl_local]] T;
-template <typename T> using Global = [[clang::opencl_global]] T;
-
-/// Returns the number of CUDA blocks in the 'x' dimension.
-LIBC_INLINE uint32_t get_num_blocks_x() {
-  return __nvvm_read_ptx_sreg_nctaid_x();
-}
-
-/// Returns the number of CUDA blocks in the 'y' dimension.
-LIBC_INLINE uint32_t get_num_blocks_y() {
-  return __nvvm_read_ptx_sreg_nctaid_y();
-}
-
-/// Returns the number of CUDA blocks in the 'z' dimension.
-LIBC_INLINE uint32_t get_num_blocks_z() {
-  return __nvvm_read_ptx_sreg_nctaid_z();
-}
-
-/// Returns the total number of CUDA blocks.
-LIBC_INLINE uint64_t get_num_blocks() {
-  return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
-}
-
-/// Returns the 'x' dimension of the current CUDA block's id.
-LIBC_INLINE uint32_t get_block_id_x() { return __nvvm_read_ptx_sreg_ctaid_x(); }
-
-/// Returns the 'y' dimension of the current CUDA block's id.
-LIBC_INLINE uint32_t get_block_id_y() { return __nvvm_read_ptx_sreg_ctaid_y(); }
-
-/// Returns the 'z' dimension of the current CUDA block's id.
-LIBC_INLINE uint32_t get_block_id_z() { return __nvvm_read_ptx_sreg_ctaid_z(); }
-
-/// Returns the absolute id of the CUDA block.
-LIBC_INLINE uint64_t get_block_id() {
-  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
-         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
-}
-
-/// Returns the number of CUDA threads in the 'x' dimension.
-LIBC_INLINE uint32_t get_num_threads_x() {
-  return __nvvm_read_ptx_sreg_ntid_x();
-}
-
-/// Returns the number of CUDA threads in the 'y' dimension.
-LIBC_INLINE uint32_t get_num_threads_y() {
-  return __nvvm_read_ptx_sreg_ntid_y();
-}
-
-/// Returns the number of CUDA threads in the 'z' dimension.
-LIBC_INLINE uint32_t get_num_threads_z() {
-  return __nvvm_read_ptx_sreg_ntid_z();
-}
-
-/// Returns the total number of threads in the block.
-LIBC_INLINE uint64_t get_num_threads() {
-  return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
-}
-
-/// Returns the 'x' dimension id of the thread in the current CUDA block.
-LIBC_INLINE uint32_t get_thread_id_x() { return __nvvm_read_ptx_sreg_tid_x(); }
-
-/// Returns the 'y' dimension id of the thread in the current CUDA block.
-LIBC_INLINE uint32_t get_thread_id_y() { return __nvvm_read_ptx_sreg_tid_y(); }
-
-/// Returns the 'z' dimension id of the thread in the current CUDA block.
-LIBC_INLINE uint32_t get_thread_id_z() { return __nvvm_read_ptx_sreg_tid_z(); }
-
-/// Returns the absolute id of the thread in the current CUDA block.
-LIBC_INLINE uint64_t get_thread_id() {
-  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
-         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
-}
-
-/// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
-LIBC_INLINE uint32_t get_lane_size() { return 32; }
-
-/// Returns the id of the thread inside of a CUDA warp executing together.
-[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
-  return __nvvm_read_ptx_sreg_laneid();
-}
-
-/// Returns the bit-mask of active threads in the current warp.
-[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
-  return __nvvm_activemask();
-}
-
-/// Copies the value from the first active thread in the warp to the rest.
-[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask,
-                                                           uint32_t x) {
-  uint32_t mask = static_cast<uint32_t>(lane_mask);
-  uint32_t id = __builtin_ffs(mask) - 1;
-  return __nvvm_shfl_sync_idx_i32(mask, x, id, get_lane_size() - 1);
-}
-
-/// Returns a bitmask of threads in the current lane for which \p x is true.
-[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
-  uint32_t mask = static_cast<uint32_t>(lane_mask);
-  return __nvvm_vote_ballot_sync(mask, x);
-}
-
-/// Waits for all the threads in the block to converge and issues a fence.
-[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }
-
-/// Waits for all pending memory operations to complete in program order.
-[[clang::convergent]] LIBC_INLINE void memory_fence() { __nvvm_membar_sys(); }
-
-/// Waits for all threads in the warp to reconverge for independent scheduling.
-[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {
-  __nvvm_bar_warp_sync(static_cast<uint32_t>(mask));
-}
-
-/// Shuffles the the lanes inside the warp according to the given index.
-[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t lane_mask,
-                                                   uint32_t idx, uint32_t x) {
-  uint32_t mask = static_cast<uint32_t>(lane_mask);
-  uint32_t bitmask = (mask >> idx) & 1;
-  return -bitmask & __nvvm_shfl_sync_idx_i32(mask, x, idx, get_lane_size() - 1);
-}
-
-/// Returns the current value of the GPU's processor clock.
-LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
-
-/// Returns a global fixed-frequency timer at nanosecond frequency.
-LIBC_INLINE uint64_t fixed_frequency_clock() {
-  return __builtin_readsteadycounter();
-}
-
-/// Terminates execution of the calling thread.
-[[noreturn]] LIBC_INLINE void end_program() { __nvvm_exit(); }
-
-/// Returns a unique identifier for the process cluster the current warp is
-/// executing on. Here we use the identifier for the symmetric multiprocessor.
-LIBC_INLINE uint32_t get_cluster_id() { return __nvvm_read_ptx_sreg_smid(); }
-
-} // namespace gpu
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index ae52e7a088ad..e138c84c0cb2 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -9,48 +9,108 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H
 #define LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H
 
+#include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/architectures.h"
 
-#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
-#include "amdgpu/utils.h"
-#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
-#include "nvptx/utils.h"
-#else
-#include "generic/utils.h"
+#if !__has_include(<gpuintrin.h>)
+#error "Unsupported compiler"
 #endif
 
+#include <gpuintrin.h>
+
 namespace LIBC_NAMESPACE_DECL {
 namespace gpu {
-/// Get the first active thread inside the lane.
-LIBC_INLINE uint64_t get_first_lane_id(uint64_t lane_mask) {
-  return __builtin_ffsll(lane_mask) - 1;
+
+template <typename T> using Private = __gpu_private T;
+template <typename T> using Constant = __gpu_constant T;
+template <typename T> using Local = __gpu_local T;
+template <typename T> using Global = __gpu_local T;
+
+LIBC_INLINE uint32_t get_num_blocks_x() { return __gpu_num_blocks(0); }
+
+LIBC_INLINE uint32_t get_num_blocks_y() { return __gpu_num_blocks(1); }
+
+LIBC_INLINE uint32_t get_num_blocks_z() { return __gpu_num_blocks(2); }
+
+LIBC_INLINE uint64_t get_num_blocks() {
+  return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
+}
+
+LIBC_INLINE uint32_t get_block_id_x() { return __gpu_block_id(0); }
+
+LIBC_INLINE uint32_t get_block_id_y() { return __gpu_block_id(1); }
+
+LIBC_INLINE uint32_t get_block_id_z() { return __gpu_block_id(2); }
+
+LIBC_INLINE uint64_t get_block_id() {
+  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
+         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
+}
+
+LIBC_INLINE uint32_t get_num_threads_x() { return __gpu_num_threads(0); }
+
+LIBC_INLINE uint32_t get_num_threads_y() { return __gpu_num_threads(1); }
+
+LIBC_INLINE uint32_t get_num_threads_z() { return __gpu_num_threads(2); }
+
+LIBC_INLINE uint64_t get_num_threads() {
+  return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
+}
+
+LIBC_INLINE uint32_t get_thread_id_x() { return __gpu_thread_id(0); }
+
+LIBC_INLINE uint32_t get_thread_id_y() { return __gpu_thread_id(1); }
+
+LIBC_INLINE uint32_t get_thread_id_z() { return __gpu_thread_id(2); }
+
+LIBC_INLINE uint64_t get_thread_id() {
+  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
+         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
+}
+
+LIBC_INLINE uint32_t get_lane_size() { return __gpu_num_lanes(); }
+
+LIBC_INLINE uint32_t get_lane_id() { return __gpu_lane_id(); }
+
+LIBC_INLINE uint64_t get_lane_mask() { return __gpu_lane_mask(); }
+
+LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
+  return __gpu_read_first_lane_u32(lane_mask, x);
+}
+
+LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
+  return __gpu_ballot(lane_mask, x);
+}
+
+LIBC_INLINE void sync_threads() { __gpu_sync_threads(); }
+
+LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); }
+
+LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
+  return __gpu_shuffle_idx_u32(lane_mask, idx, x);
 }
 
-/// Conditional that is only true for a single thread in a lane.
+[[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
+
 LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
-  return gpu::get_lane_id() == get_first_lane_id(lane_mask);
+  return __gpu_is_first_in_lane(lane_mask);
 }
 
-/// Gets the sum of all lanes inside the warp or wavefront.
 LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) {
-  for (uint32_t step = gpu::get_lane_size() / 2; step > 0; step /= 2) {
-    uint32_t index = step + gpu::get_lane_id();
-    x += gpu::shuffle(lane_mask, index, x);
-  }
-  return gpu::broadcast_value(lane_mask, x);
+  return __gpu_lane_sum_u32(lane_mask, x);
 }
 
-/// Gets the accumulator scan of the threads in the warp or wavefront.
 LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) {
-  for (uint32_t step = 1; step < gpu::get_lane_size(); step *= 2) {
-    uint32_t index = gpu::get_lane_id() - step;
-    uint32_t bitmask = gpu::get_lane_id() >= step;
-    x += -bitmask & gpu::shuffle(lane_mask, index, x);
-  }
-  return x;
+  return __gpu_lane_scan_u32(lane_mask, x);
+}
+
+LIBC_INLINE uint64_t fixed_frequency_clock() {
+  return __builtin_readsteadycounter();
 }
 
+LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
+
 } // namespace gpu
 } // namespace LIBC_NAMESPACE_DECL
 
diff --git a/libc/src/__support/fixedvector.h b/libc/src/__support/fixedvector.h
index 7ac0c230f9c5..34601f86dc01 100644
--- a/libc/src/__support/fixedvector.h
+++ b/libc/src/__support/fixedvector.h
@@ -10,9 +10,10 @@
 #define LLVM_LIBC_SRC___SUPPORT_FIXEDVECTOR_H
 
 #include "src/__support/CPP/array.h"
-
 #include "src/__support/CPP/iterator.h"
+#include "src/__support/libc_assert.h"
 #include "src/__support/macros/config.h"
+#include "src/string/memory_utils/inline_memset.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -23,27 +24,32 @@ template <typename T, size_t CAPACITY> class FixedVector {
   size_t item_count = 0;
 
 public:
-  constexpr FixedVector() = default;
+  LIBC_INLINE constexpr FixedVector() = default;
 
   using iterator = typename cpp::array<T, CAPACITY>::iterator;
-  constexpr FixedVector(iterator begin, iterator end) : store{}, item_count{} {
+  LIBC_INLINE constexpr FixedVector(iterator begin, iterator end)
+      : store{}, item_count{} {
+    LIBC_ASSERT(begin + CAPACITY >= end);
     for (; begin != end; ++begin)
       push_back(*begin);
   }
 
   using const_iterator = typename cpp::array<T, CAPACITY>::const_iterator;
-  constexpr FixedVector(const_iterator begin, const_iterator end)
+  LIBC_INLINE constexpr FixedVector(const_iterator begin, const_iterator end)
       : store{}, item_count{} {
+    LIBC_ASSERT(begin + CAPACITY >= end);
     for (; begin != end; ++begin)
       push_back(*begin);
   }
 
-  constexpr FixedVector(size_t count, const T &value) : store{}, item_count{} {
+  LIBC_INLINE constexpr FixedVector(size_t count, const T &value)
+      : store{}, item_count{} {
+    LIBC_ASSERT(count <= CAPACITY);
     for (size_t i = 0; i < count; ++i)
       push_back(value);
   }
 
-  constexpr bool push_back(const T &obj) {
+  LIBC_INLINE constexpr bool push_back(const T &obj) {
     if (item_count == CAPACITY)
       return false;
     store[item_count] = obj;
@@ -51,27 +57,43 @@ public:
     return true;
   }
 
-  constexpr const T &back() const { return store[item_count - 1]; }
+  LIBC_INLINE constexpr const T &back() const {
+    LIBC_ASSERT(!empty());
+    return store[item_count - 1];
+  }
 
-  constexpr T &back() { return store[item_count - 1]; }
+  LIBC_INLINE constexpr T &back() {
+    LIBC_ASSERT(!empty());
+    return store[item_count - 1];
+  }
 
-  constexpr bool pop_back() {
+  LIBC_INLINE constexpr bool pop_back() {
     if (item_count == 0)
       return false;
+    inline_memset(&store[item_count - 1], 0, sizeof(T));
     --item_count;
     return true;
   }
 
-  constexpr T &operator[](size_t idx) { return store[idx]; }
+  LIBC_INLINE constexpr T &operator[](size_t idx) {
+    LIBC_ASSERT(idx < item_count);
+    return store[idx];
+  }
 
-  constexpr const T &operator[](size_t idx) const { return store[idx]; }
+  LIBC_INLINE constexpr const T &operator[](size_t idx) const {
+    LIBC_ASSERT(idx < item_count);
+    return store[idx];
+  }
 
-  constexpr bool empty() const { return item_count == 0; }
+  LIBC_INLINE constexpr bool empty() const { return item_count == 0; }
 
-  constexpr size_t size() const { return item_count; }
+  LIBC_INLINE constexpr size_t size() const { return item_count; }
 
   // Empties the store for all practical purposes.
-  constexpr void reset() { item_count = 0; }
+  LIBC_INLINE constexpr void reset() {
+    inline_memset(store.data(), 0, sizeof(T) * item_count);
+    item_count = 0;
+  }
 
   // This static method does not free up the resources held by |store|,
   // say by calling `free` or something similar. It just does the equivalent
@@ -81,7 +103,9 @@ public:
   // dynamically allocated storate. So, the `destroy` method like this
   // matches the `destroy` API of those other data structures so that users
   // can easily swap one data structure for the other.
-  static void destroy(FixedVector<T, CAPACITY> *store) { store->reset(); }
+  LIBC_INLINE static void destroy(FixedVector<T, CAPACITY> *store) {
+    store->reset();
+  }
 
   using reverse_iterator = typename cpp::array<T, CAPACITY>::reverse_iterator;
   LIBC_INLINE constexpr reverse_iterator rbegin() {
diff --git a/libc/src/__support/threads/thread.cpp b/libc/src/__support/threads/thread.cpp
index dad4f75f092e..6f6b75be5766 100644
--- a/libc/src/__support/threads/thread.cpp
+++ b/libc/src/__support/threads/thread.cpp
@@ -117,7 +117,9 @@ public:
 
   int add_callback(AtExitCallback *callback, void *obj) {
     cpp::lock_guard lock(mtx);
-    return callback_list.push_back({callback, obj});
+    if (callback_list.push_back({callback, obj}))
+      return 0;
+    return -1;
   }
 
   void call() {
diff --git a/libc/src/pthread/pthread_condattr_init.cpp b/libc/src/pthread/pthread_condattr_init.cpp
index 12005b8a9d30..b360804bb7bb 100644
--- a/libc/src/pthread/pthread_condattr_init.cpp
+++ b/libc/src/pthread/pthread_condattr_init.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 
-#include <pthread.h> // pthread_condattr_t, PTHREAD_PROCESS_PRIVATE
-#include <time.h>    // CLOCK_REALTIME
+#include "hdr/time_macros.h" // CLOCK_REALTIME
+#include <pthread.h>         // pthread_condattr_t, PTHREAD_PROCESS_PRIVATE
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/pthread/pthread_condattr_setclock.cpp b/libc/src/pthread/pthread_condattr_setclock.cpp
index 37fbd6b27143..5e825d5ecea6 100644
--- a/libc/src/pthread/pthread_condattr_setclock.cpp
+++ b/libc/src/pthread/pthread_condattr_setclock.cpp
@@ -12,9 +12,9 @@
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
 
-#include <pthread.h>   // pthread_condattr_t
-#include <sys/types.h> // clockid_t
-#include <time.h>      // CLOCK_MONOTONIC, CLOCK_REALTIME
+#include "hdr/time_macros.h" // CLOCK_MONOTONIC, CLOCK_REALTIME
+#include <pthread.h>         // pthread_condattr_t
+#include <sys/types.h>       // clockid_t
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/exit_handler.h b/libc/src/stdlib/exit_handler.h
index 9720c5473940..e9d163dfe902 100644
--- a/libc/src/stdlib/exit_handler.h
+++ b/libc/src/stdlib/exit_handler.h
@@ -48,7 +48,7 @@ LIBC_INLINE void stdc_at_exit_func(void *payload) {
 LIBC_INLINE void call_exit_callbacks(ExitCallbackList &callbacks) {
   handler_list_mtx.lock();
   while (!callbacks.empty()) {
-    AtExitUnit &unit = callbacks.back();
+    AtExitUnit unit = callbacks.back();
     callbacks.pop_back();
     handler_list_mtx.unlock();
     unit.callback(unit.payload);
diff --git a/libc/src/time/CMakeLists.txt b/libc/src/time/CMakeLists.txt
index ae835dcc7427..ef9bfe57bc4e 100644
--- a/libc/src/time/CMakeLists.txt
+++ b/libc/src/time/CMakeLists.txt
@@ -2,6 +2,17 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
 endif()
 
+add_header_library(
+  time_constants
+  HDRS
+    time_constants.h
+  DEPENDS
+    libc.include.time
+    libc.src.__support.CPP.array
+    libc.src.__support.CPP.string_view
+    libc.hdr.types.time_t
+)
+
 add_object_library(
   time_utils
   SRCS
@@ -12,6 +23,10 @@ add_object_library(
     libc.include.time
     libc.src.__support.CPP.limits
     libc.src.errno.errno
+    .time_constants
+    libc.hdr.types.time_t
+    libc.hdr.types.size_t
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
@@ -22,7 +37,9 @@ add_entrypoint_object(
     asctime.h
   DEPENDS
     .time_utils
+    .time_constants
     libc.include.time
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
@@ -33,7 +50,9 @@ add_entrypoint_object(
     asctime_r.h
   DEPENDS
     .time_utils
+    .time_constants
     libc.include.time
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
@@ -44,6 +63,7 @@ add_entrypoint_object(
     ctime.h
   DEPENDS
     .time_utils
+    .time_constants
     libc.hdr.types.time_t
     libc.include.time
 )
@@ -56,6 +76,7 @@ add_entrypoint_object(
     ctime_r.h
   DEPENDS
     .time_utils
+    .time_constants
     libc.hdr.types.time_t
     libc.include.time
 )
@@ -68,6 +89,7 @@ add_entrypoint_object(
     difftime.h
   DEPENDS
     libc.include.time
+    libc.hdr.types.time_t
 )
 
 add_entrypoint_object(
@@ -79,6 +101,8 @@ add_entrypoint_object(
   DEPENDS
     .time_utils
     libc.include.time
+    libc.hdr.types.time_t
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
@@ -90,6 +114,8 @@ add_entrypoint_object(
   DEPENDS
     .time_utils
     libc.include.time
+    libc.hdr.types.time_t
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
@@ -100,8 +126,11 @@ add_entrypoint_object(
     mktime.h
   DEPENDS
     .time_utils
+    .time_constants
     libc.include.time
     libc.src.errno.errno
+    libc.hdr.types.time_t
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
@@ -115,6 +144,7 @@ add_entrypoint_object(
     libc.hdr.types.time_t
     libc.src.__support.time.clock_gettime
     libc.src.errno.errno
+    libc.hdr.types.struct_tm
 )
 
 add_entrypoint_object(
diff --git a/libc/src/time/asctime.cpp b/libc/src/time/asctime.cpp
index d6fbe7316ced..2b00c4136f90 100644
--- a/libc/src/time/asctime.cpp
+++ b/libc/src/time/asctime.cpp
@@ -9,15 +9,15 @@
 #include "src/time/asctime.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
+#include "src/time/time_constants.h"
 #include "src/time/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 LLVM_LIBC_FUNCTION(char *, asctime, (const struct tm *timeptr)) {
-  static char buffer[TimeConstants::ASCTIME_BUFFER_SIZE];
-  return time_utils::asctime(timeptr, buffer, TimeConstants::ASCTIME_MAX_BYTES);
+  static char buffer[time_constants::ASCTIME_BUFFER_SIZE];
+  return time_utils::asctime(timeptr, buffer,
+                             time_constants::ASCTIME_MAX_BYTES);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/time/asctime.h b/libc/src/time/asctime.h
index 623e6dff60c3..37325e75b829 100644
--- a/libc/src/time/asctime.h
+++ b/libc/src/time/asctime.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_SRC_TIME_ASCTIME_H
 #define LLVM_LIBC_SRC_TIME_ASCTIME_H
 
+#include "hdr/types/struct_tm.h"
 #include "src/__support/macros/config.h"
-#include <time.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/asctime_r.cpp b/libc/src/time/asctime_r.cpp
index caa22f1cd778..bf53bfdf2f8c 100644
--- a/libc/src/time/asctime_r.cpp
+++ b/libc/src/time/asctime_r.cpp
@@ -9,15 +9,15 @@
 #include "src/time/asctime_r.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
+#include "src/time/time_constants.h"
 #include "src/time/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 LLVM_LIBC_FUNCTION(char *, asctime_r,
                    (const struct tm *timeptr, char *buffer)) {
-  return time_utils::asctime(timeptr, buffer, TimeConstants::ASCTIME_MAX_BYTES);
+  return time_utils::asctime(timeptr, buffer,
+                             time_constants::ASCTIME_MAX_BYTES);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/time/asctime_r.h b/libc/src/time/asctime_r.h
index 328b7dff19c2..65a6b84ca38f 100644
--- a/libc/src/time/asctime_r.h
+++ b/libc/src/time/asctime_r.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_SRC_TIME_ASCTIME_R_H
 #define LLVM_LIBC_SRC_TIME_ASCTIME_R_H
 
+#include "hdr/types/struct_tm.h"
 #include "src/__support/macros/config.h"
-#include <time.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/ctime.cpp b/libc/src/time/ctime.cpp
index 8adae9be7380..ac0ffe5b32ae 100644
--- a/libc/src/time/ctime.cpp
+++ b/libc/src/time/ctime.cpp
@@ -6,23 +6,22 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "ctime.h"
+#include "src/time/ctime.h"
 #include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "time_utils.h"
+#include "src/time/time_constants.h"
+#include "src/time/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 LLVM_LIBC_FUNCTION(char *, ctime, (const time_t *t_ptr)) {
   if (t_ptr == nullptr || *t_ptr > cpp::numeric_limits<int32_t>::max()) {
     return nullptr;
   }
-  static char buffer[TimeConstants::ASCTIME_BUFFER_SIZE];
+  static char buffer[time_constants::ASCTIME_BUFFER_SIZE];
   return time_utils::asctime(time_utils::localtime(t_ptr), buffer,
-                             TimeConstants::ASCTIME_MAX_BYTES);
+                             time_constants::ASCTIME_MAX_BYTES);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/time/ctime_r.cpp b/libc/src/time/ctime_r.cpp
index 63d93c4085f3..7224f7742f13 100644
--- a/libc/src/time/ctime_r.cpp
+++ b/libc/src/time/ctime_r.cpp
@@ -6,16 +6,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "ctime_r.h"
+#include "src/time/ctime_r.h"
 #include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "time_utils.h"
+#include "src/time/time_constants.h"
+#include "src/time/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 LLVM_LIBC_FUNCTION(char *, ctime_r, (const time_t *t_ptr, char *buffer)) {
   if (t_ptr == nullptr || buffer == nullptr ||
       *t_ptr > cpp::numeric_limits<int32_t>::max()) {
@@ -23,7 +22,7 @@ LLVM_LIBC_FUNCTION(char *, ctime_r, (const time_t *t_ptr, char *buffer)) {
   }
 
   return time_utils::asctime(time_utils::localtime(t_ptr), buffer,
-                             TimeConstants::ASCTIME_MAX_BYTES);
+                             time_constants::ASCTIME_MAX_BYTES);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/time/difftime.h b/libc/src/time/difftime.h
index d5cd593cc533..12de5678864c 100644
--- a/libc/src/time/difftime.h
+++ b/libc/src/time/difftime.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_SRC_TIME_DIFFTIME_H
 #define LLVM_LIBC_SRC_TIME_DIFFTIME_H
 
+#include "hdr/types/time_t.h"
 #include "src/__support/macros/config.h"
-#include <time.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/gmtime.h b/libc/src/time/gmtime.h
index 3de3cebbfde2..ac7f1be7bbce 100644
--- a/libc/src/time/gmtime.h
+++ b/libc/src/time/gmtime.h
@@ -9,8 +9,9 @@
 #ifndef LLVM_LIBC_SRC_TIME_GMTIME_H
 #define LLVM_LIBC_SRC_TIME_GMTIME_H
 
+#include "hdr/types/struct_tm.h"
+#include "hdr/types/time_t.h"
 #include "src/__support/macros/config.h"
-#include <time.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/gmtime_r.h b/libc/src/time/gmtime_r.h
index b4f387ef443b..4c88b22faf4c 100644
--- a/libc/src/time/gmtime_r.h
+++ b/libc/src/time/gmtime_r.h
@@ -9,8 +9,9 @@
 #ifndef LLVM_LIBC_SRC_TIME_GMTIME_R_H
 #define LLVM_LIBC_SRC_TIME_GMTIME_R_H
 
+#include "hdr/types/struct_tm.h"
+#include "hdr/types/time_t.h"
 #include "src/__support/macros/config.h"
-#include <time.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/gpu/clock.cpp b/libc/src/time/gpu/clock.cpp
index add5b2725ef8..8609c5cd6b6b 100644
--- a/libc/src/time/gpu/clock.cpp
+++ b/libc/src/time/gpu/clock.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/time/clock.h"
+
+#include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/gpu/time_utils.h"
 
diff --git a/libc/src/time/gpu/nanosleep.cpp b/libc/src/time/gpu/nanosleep.cpp
index a92f660f225c..d22d9d6bd8d7 100644
--- a/libc/src/time/gpu/nanosleep.cpp
+++ b/libc/src/time/gpu/nanosleep.cpp
@@ -8,6 +8,7 @@
 
 #include "src/time/nanosleep.h"
 
+#include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/gpu/time_utils.h"
 
diff --git a/libc/src/time/mktime.cpp b/libc/src/time/mktime.cpp
index 72cd22912053..3874cad02fac 100644
--- a/libc/src/time/mktime.cpp
+++ b/libc/src/time/mktime.cpp
@@ -9,15 +9,11 @@
 #include "src/time/mktime.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
+#include "src/time/time_constants.h"
 #include "src/time/time_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
-static constexpr int NON_LEAP_YEAR_DAYS_IN_MONTH[] = {31, 28, 31, 30, 31, 30,
-                                                      31, 31, 30, 31, 30, 31};
-
 // Returns number of years from (1, year).
 static constexpr int64_t get_num_of_leap_years_before(int64_t year) {
   return (year / 4) - (year / 100) + (year / 400);
@@ -31,12 +27,12 @@ static constexpr bool is_leap_year(const int64_t year) {
 LLVM_LIBC_FUNCTION(time_t, mktime, (struct tm * tm_out)) {
   // Unlike most C Library functions, mktime doesn't just die on bad input.
   // TODO(rtenneti); Handle leap seconds.
-  int64_t tm_year_from_base = tm_out->tm_year + TimeConstants::TIME_YEAR_BASE;
+  int64_t tm_year_from_base = tm_out->tm_year + time_constants::TIME_YEAR_BASE;
 
   // 32-bit end-of-the-world is 03:14:07 UTC on 19 January 2038.
   if (sizeof(time_t) == 4 &&
-      tm_year_from_base >= TimeConstants::END_OF32_BIT_EPOCH_YEAR) {
-    if (tm_year_from_base > TimeConstants::END_OF32_BIT_EPOCH_YEAR)
+      tm_year_from_base >= time_constants::END_OF32_BIT_EPOCH_YEAR) {
+    if (tm_year_from_base > time_constants::END_OF32_BIT_EPOCH_YEAR)
       return time_utils::out_of_range();
     if (tm_out->tm_mon > 0)
       return time_utils::out_of_range();
@@ -64,7 +60,7 @@ LLVM_LIBC_FUNCTION(time_t, mktime, (struct tm * tm_out)) {
 
   // Calculate number of months and years from tm_mon.
   int64_t month = tm_out->tm_mon;
-  if (month < 0 || month >= TimeConstants::MONTHS_PER_YEAR - 1) {
+  if (month < 0 || month >= time_constants::MONTHS_PER_YEAR - 1) {
     int64_t years = month / 12;
     month %= 12;
     if (month < 0) {
@@ -78,23 +74,23 @@ LLVM_LIBC_FUNCTION(time_t, mktime, (struct tm * tm_out)) {
   // Calculate total number of days based on the month and the day (tm_mday).
   int64_t total_days = tm_out->tm_mday - 1;
   for (int64_t i = 0; i < month; ++i)
-    total_days += NON_LEAP_YEAR_DAYS_IN_MONTH[i];
+    total_days += time_constants::NON_LEAP_YEAR_DAYS_IN_MONTH[i];
   // Add one day if it is a leap year and the month is after February.
   if (tm_year_is_leap && month > 1)
     total_days++;
 
   // Calculate total numbers of days based on the year.
-  total_days += (tm_year_from_base - TimeConstants::EPOCH_YEAR) *
-                TimeConstants::DAYS_PER_NON_LEAP_YEAR;
-  if (tm_year_from_base >= TimeConstants::EPOCH_YEAR) {
+  total_days += (tm_year_from_base - time_constants::EPOCH_YEAR) *
+                time_constants::DAYS_PER_NON_LEAP_YEAR;
+  if (tm_year_from_base >= time_constants::EPOCH_YEAR) {
     total_days += get_num_of_leap_years_before(tm_year_from_base - 1) -
-                  get_num_of_leap_years_before(TimeConstants::EPOCH_YEAR);
+                  get_num_of_leap_years_before(time_constants::EPOCH_YEAR);
   } else if (tm_year_from_base >= 1) {
-    total_days -= get_num_of_leap_years_before(TimeConstants::EPOCH_YEAR) -
+    total_days -= get_num_of_leap_years_before(time_constants::EPOCH_YEAR) -
                   get_num_of_leap_years_before(tm_year_from_base - 1);
   } else {
     // Calculate number of leap years until 0th year.
-    total_days -= get_num_of_leap_years_before(TimeConstants::EPOCH_YEAR) -
+    total_days -= get_num_of_leap_years_before(time_constants::EPOCH_YEAR) -
                   get_num_of_leap_years_before(0);
     if (tm_year_from_base <= 0) {
       total_days -= 1; // Subtract 1 for 0th year.
@@ -106,11 +102,12 @@ LLVM_LIBC_FUNCTION(time_t, mktime, (struct tm * tm_out)) {
     }
   }
 
-  // TODO(rtenneti): Need to handle timezone and update of tm_isdst.
+  // TODO: https://github.com/llvm/llvm-project/issues/121962
+  // Need to handle timezone and update of tm_isdst.
   int64_t seconds = tm_out->tm_sec +
-                    tm_out->tm_min * TimeConstants::SECONDS_PER_MIN +
-                    tm_out->tm_hour * TimeConstants::SECONDS_PER_HOUR +
-                    total_days * TimeConstants::SECONDS_PER_DAY;
+                    tm_out->tm_min * time_constants::SECONDS_PER_MIN +
+                    tm_out->tm_hour * time_constants::SECONDS_PER_HOUR +
+                    total_days * time_constants::SECONDS_PER_DAY;
 
   // Update the tm structure's year, month, day, etc. from seconds.
   if (time_utils::update_from_seconds(seconds, tm_out) < 0)
diff --git a/libc/src/time/mktime.h b/libc/src/time/mktime.h
index 2b4c67996555..985c6293f9d5 100644
--- a/libc/src/time/mktime.h
+++ b/libc/src/time/mktime.h
@@ -9,8 +9,9 @@
 #ifndef LLVM_LIBC_SRC_TIME_MKTIME_H
 #define LLVM_LIBC_SRC_TIME_MKTIME_H
 
+#include "hdr/types/struct_tm.h"
+#include "hdr/types/time_t.h"
 #include "src/__support/macros/config.h"
-#include <time.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/time.cpp b/libc/src/time/time.cpp
index 4a0b614a68ef..860909af7488 100644
--- a/libc/src/time/time.cpp
+++ b/libc/src/time/time.cpp
@@ -6,12 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/time/time_func.h"
+
 #include "hdr/time_macros.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/clock_gettime.h"
 #include "src/errno/libc_errno.h"
-#include "src/time/time_func.h"
 
 namespace LIBC_NAMESPACE_DECL {
 // avoid inconsitent clang-format behavior
diff --git a/libc/src/time/time_constants.h b/libc/src/time/time_constants.h
new file mode 100644
index 000000000000..3e25f741745a
--- /dev/null
+++ b/libc/src/time/time_constants.h
@@ -0,0 +1,100 @@
+//===-- Collection of constants for time functions --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_TIME_TIME_CONSTANTS_H
+#define LLVM_LIBC_SRC_TIME_TIME_CONSTANTS_H
+
+#include "hdr/types/time_t.h"
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/string_view.h"
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE_DECL {
+namespace time_constants {
+
+enum Month : int {
+  JANUARY,
+  FEBRUARY,
+  MARCH,
+  APRIL,
+  MAY,
+  JUNE,
+  JULY,
+  AUGUST,
+  SEPTEMBER,
+  OCTOBER,
+  NOVEMBER,
+  DECEMBER
+};
+
+constexpr int SECONDS_PER_MIN = 60;
+constexpr int MINUTES_PER_HOUR = 60;
+constexpr int HOURS_PER_DAY = 24;
+constexpr int DAYS_PER_WEEK = 7;
+constexpr int MONTHS_PER_YEAR = 12;
+constexpr int DAYS_PER_NON_LEAP_YEAR = 365;
+constexpr int DAYS_PER_LEAP_YEAR = 366;
+
+constexpr int SECONDS_PER_HOUR = SECONDS_PER_MIN * MINUTES_PER_HOUR;
+constexpr int SECONDS_PER_DAY = SECONDS_PER_HOUR * HOURS_PER_DAY;
+constexpr int NUMBER_OF_SECONDS_IN_LEAP_YEAR =
+    DAYS_PER_LEAP_YEAR * SECONDS_PER_DAY;
+
+constexpr int TIME_YEAR_BASE = 1900;
+constexpr int EPOCH_YEAR = 1970;
+constexpr int EPOCH_WEEK_DAY = 4;
+
+// For asctime the behavior is undefined if struct tm's tm_wday or tm_mon are
+// not within the normal ranges as defined in <time.h>, or if struct tm's
+// tm_year exceeds {INT_MAX}-1990, or if the below asctime_internal algorithm
+// would attempt to generate more than 26 bytes of output (including the
+// terminating null).
+constexpr int ASCTIME_BUFFER_SIZE = 256;
+constexpr int ASCTIME_MAX_BYTES = 26;
+
+/* 2000-03-01 (mod 400 year, immediately after feb29 */
+constexpr int64_t SECONDS_UNTIL2000_MARCH_FIRST =
+    (946684800LL + SECONDS_PER_DAY * (31 + 29));
+constexpr int WEEK_DAY_OF2000_MARCH_FIRST = 3;
+
+constexpr int DAYS_PER400_YEARS =
+    (DAYS_PER_NON_LEAP_YEAR * 400) + (400 / 4) - 3;
+constexpr int DAYS_PER100_YEARS =
+    (DAYS_PER_NON_LEAP_YEAR * 100) + (100 / 4) - 1;
+constexpr int DAYS_PER4_YEARS = (DAYS_PER_NON_LEAP_YEAR * 4) + 1;
+
+// The latest time that can be represented in this form is 03:14:07 UTC on
+// Tuesday, 19 January 2038 (corresponding to 2,147,483,647 seconds since the
+// start of the epoch). This means that systems using a 32-bit time_t type are
+// susceptible to the Year 2038 problem.
+constexpr int END_OF32_BIT_EPOCH_YEAR = 2038;
+
+constexpr time_t OUT_OF_RANGE_RETURN_VALUE = -1;
+
+constexpr cpp::array<cpp::string_view, DAYS_PER_WEEK> WEEK_DAY_NAMES = {
+    "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"};
+
+constexpr cpp::array<cpp::string_view, DAYS_PER_WEEK> WEEK_DAY_FULL_NAMES = {
+    "Sunday",   "Monday", "Tuesday", "Wednesday",
+    "Thursday", "Friday", "Saturday"};
+
+constexpr cpp::array<cpp::string_view, MONTHS_PER_YEAR> MONTH_NAMES = {
+    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"};
+
+constexpr cpp::array<cpp::string_view, MONTHS_PER_YEAR> MONTH_FULL_NAMES = {
+    "January", "February", "March",     "April",   "May",      "June",
+    "July",    "August",   "September", "October", "November", "December"};
+
+constexpr int NON_LEAP_YEAR_DAYS_IN_MONTH[] = {31, 28, 31, 30, 31, 30,
+                                               31, 31, 30, 31, 30, 31};
+
+} // namespace time_constants
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_TIME_TIME_CONSTANTS_H
diff --git a/libc/src/time/time_utils.cpp b/libc/src/time/time_utils.cpp
index 509cad8146df..abc93b8cb961 100644
--- a/libc/src/time/time_utils.cpp
+++ b/libc/src/time/time_utils.cpp
@@ -10,12 +10,11 @@
 #include "src/__support/CPP/limits.h" // INT_MIN, INT_MAX
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
+#include "src/time/time_constants.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace time_utils {
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 static int64_t computeRemainingYears(int64_t daysPerYears,
                                      int64_t quotientYears,
                                      int64_t *remainingDays) {
@@ -52,36 +51,36 @@ int64_t update_from_seconds(int64_t total_seconds, struct tm *tm) {
       (sizeof(time_t) == 4)
           ? INT_MIN
           : INT_MIN * static_cast<int64_t>(
-                          TimeConstants::NUMBER_OF_SECONDS_IN_LEAP_YEAR);
+                          time_constants::NUMBER_OF_SECONDS_IN_LEAP_YEAR);
   constexpr time_t time_max =
       (sizeof(time_t) == 4)
           ? INT_MAX
           : INT_MAX * static_cast<int64_t>(
-                          TimeConstants::NUMBER_OF_SECONDS_IN_LEAP_YEAR);
+                          time_constants::NUMBER_OF_SECONDS_IN_LEAP_YEAR);
 
   time_t ts = static_cast<time_t>(total_seconds);
   if (ts < time_min || ts > time_max)
     return time_utils::out_of_range();
 
   int64_t seconds =
-      total_seconds - TimeConstants::SECONDS_UNTIL2000_MARCH_FIRST;
-  int64_t days = seconds / TimeConstants::SECONDS_PER_DAY;
-  int64_t remainingSeconds = seconds % TimeConstants::SECONDS_PER_DAY;
+      total_seconds - time_constants::SECONDS_UNTIL2000_MARCH_FIRST;
+  int64_t days = seconds / time_constants::SECONDS_PER_DAY;
+  int64_t remainingSeconds = seconds % time_constants::SECONDS_PER_DAY;
   if (remainingSeconds < 0) {
-    remainingSeconds += TimeConstants::SECONDS_PER_DAY;
+    remainingSeconds += time_constants::SECONDS_PER_DAY;
     days--;
   }
 
-  int64_t wday = (TimeConstants::WEEK_DAY_OF2000_MARCH_FIRST + days) %
-                 TimeConstants::DAYS_PER_WEEK;
+  int64_t wday = (time_constants::WEEK_DAY_OF2000_MARCH_FIRST + days) %
+                 time_constants::DAYS_PER_WEEK;
   if (wday < 0)
-    wday += TimeConstants::DAYS_PER_WEEK;
+    wday += time_constants::DAYS_PER_WEEK;
 
   // Compute the number of 400 year cycles.
-  int64_t numOfFourHundredYearCycles = days / TimeConstants::DAYS_PER400_YEARS;
-  int64_t remainingDays = days % TimeConstants::DAYS_PER400_YEARS;
+  int64_t numOfFourHundredYearCycles = days / time_constants::DAYS_PER400_YEARS;
+  int64_t remainingDays = days % time_constants::DAYS_PER400_YEARS;
   if (remainingDays < 0) {
-    remainingDays += TimeConstants::DAYS_PER400_YEARS;
+    remainingDays += time_constants::DAYS_PER400_YEARS;
     numOfFourHundredYearCycles--;
   }
 
@@ -89,17 +88,17 @@ int64_t update_from_seconds(int64_t total_seconds, struct tm *tm) {
   // "four hundred year cycles" will be 4 hundred year cycles or less in 400
   // years.
   int64_t numOfHundredYearCycles = computeRemainingYears(
-      TimeConstants::DAYS_PER100_YEARS, 4, &remainingDays);
+      time_constants::DAYS_PER100_YEARS, 4, &remainingDays);
 
   // The remaining number of years after computing the number of
   // "hundred year cycles" will be 25 four year cycles or less in 100 years.
-  int64_t numOfFourYearCycles =
-      computeRemainingYears(TimeConstants::DAYS_PER4_YEARS, 25, &remainingDays);
+  int64_t numOfFourYearCycles = computeRemainingYears(
+      time_constants::DAYS_PER4_YEARS, 25, &remainingDays);
 
   // The remaining number of years after computing the number of
   // "four year cycles" will be 4 one year cycles or less in 4 years.
   int64_t remainingYears = computeRemainingYears(
-      TimeConstants::DAYS_PER_NON_LEAP_YEAR, 4, &remainingDays);
+      time_constants::DAYS_PER_NON_LEAP_YEAR, 4, &remainingDays);
 
   // Calculate number of years from year 2000.
   int64_t years = remainingYears + 4 * numOfFourYearCycles +
@@ -112,8 +111,8 @@ int64_t update_from_seconds(int64_t total_seconds, struct tm *tm) {
   // We add 31 and 28 for the number of days in January and February, since our
   // starting point was March 1st.
   int64_t yday = remainingDays + 31 + 28 + leapDay;
-  if (yday >= TimeConstants::DAYS_PER_NON_LEAP_YEAR + leapDay)
-    yday -= TimeConstants::DAYS_PER_NON_LEAP_YEAR + leapDay;
+  if (yday >= time_constants::DAYS_PER_NON_LEAP_YEAR + leapDay)
+    yday -= time_constants::DAYS_PER_NON_LEAP_YEAR + leapDay;
 
   int64_t months = 0;
   while (daysInMonth[months] <= remainingDays) {
@@ -121,8 +120,8 @@ int64_t update_from_seconds(int64_t total_seconds, struct tm *tm) {
     months++;
   }
 
-  if (months >= TimeConstants::MONTHS_PER_YEAR - 2) {
-    months -= TimeConstants::MONTHS_PER_YEAR;
+  if (months >= time_constants::MONTHS_PER_YEAR - 2) {
+    months -= time_constants::MONTHS_PER_YEAR;
     years++;
   }
 
@@ -131,19 +130,19 @@ int64_t update_from_seconds(int64_t total_seconds, struct tm *tm) {
 
   // All the data (years, month and remaining days) was calculated from
   // March, 2000. Thus adjust the data to be from January, 1900.
-  tm->tm_year = static_cast<int>(years + 2000 - TimeConstants::TIME_YEAR_BASE);
+  tm->tm_year = static_cast<int>(years + 2000 - time_constants::TIME_YEAR_BASE);
   tm->tm_mon = static_cast<int>(months + 2);
   tm->tm_mday = static_cast<int>(remainingDays + 1);
   tm->tm_wday = static_cast<int>(wday);
   tm->tm_yday = static_cast<int>(yday);
 
   tm->tm_hour =
-      static_cast<int>(remainingSeconds / TimeConstants::SECONDS_PER_HOUR);
+      static_cast<int>(remainingSeconds / time_constants::SECONDS_PER_HOUR);
   tm->tm_min =
-      static_cast<int>(remainingSeconds / TimeConstants::SECONDS_PER_MIN %
-                       TimeConstants::SECONDS_PER_MIN);
+      static_cast<int>(remainingSeconds / time_constants::SECONDS_PER_MIN %
+                       time_constants::SECONDS_PER_MIN);
   tm->tm_sec =
-      static_cast<int>(remainingSeconds % TimeConstants::SECONDS_PER_MIN);
+      static_cast<int>(remainingSeconds % time_constants::SECONDS_PER_MIN);
   // TODO(rtenneti): Need to handle timezone and update of tm_isdst.
   tm->tm_isdst = 0;
 
diff --git a/libc/src/time/time_utils.h b/libc/src/time/time_utils.h
index 552ea925c1c7..5e0a692d4db0 100644
--- a/libc/src/time/time_utils.h
+++ b/libc/src/time/time_utils.h
@@ -9,79 +9,19 @@
 #ifndef LLVM_LIBC_SRC_TIME_TIME_UTILS_H
 #define LLVM_LIBC_SRC_TIME_TIME_UTILS_H
 
-#include <stddef.h> // For size_t.
-
+#include "hdr/types/size_t.h"
+#include "hdr/types/struct_tm.h"
+#include "hdr/types/time_t.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/errno/libc_errno.h"
-#include "src/time/mktime.h"
+#include "time_constants.h"
 
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE_DECL {
 namespace time_utils {
 
-enum Month : int {
-  JANUARY,
-  FEBRUARY,
-  MARCH,
-  APRIL,
-  MAY,
-  JUNE,
-  JULY,
-  AUGUST,
-  SEPTEMBER,
-  OCTOBER,
-  NOVEMBER,
-  DECEMBER
-};
-
-struct TimeConstants {
-  static constexpr int SECONDS_PER_MIN = 60;
-  static constexpr int MINUTES_PER_HOUR = 60;
-  static constexpr int HOURS_PER_DAY = 24;
-  static constexpr int DAYS_PER_WEEK = 7;
-  static constexpr int MONTHS_PER_YEAR = 12;
-  static constexpr int DAYS_PER_NON_LEAP_YEAR = 365;
-  static constexpr int DAYS_PER_LEAP_YEAR = 366;
-
-  static constexpr int SECONDS_PER_HOUR = SECONDS_PER_MIN * MINUTES_PER_HOUR;
-  static constexpr int SECONDS_PER_DAY = SECONDS_PER_HOUR * HOURS_PER_DAY;
-  static constexpr int NUMBER_OF_SECONDS_IN_LEAP_YEAR =
-      DAYS_PER_LEAP_YEAR * SECONDS_PER_DAY;
-
-  static constexpr int TIME_YEAR_BASE = 1900;
-  static constexpr int EPOCH_YEAR = 1970;
-  static constexpr int EPOCH_WEEK_DAY = 4;
-
-  // For asctime the behavior is undefined if struct tm's tm_wday or tm_mon are
-  // not within the normal ranges as defined in <time.h>, or if struct tm's
-  // tm_year exceeds {INT_MAX}-1990, or if the below asctime_internal algorithm
-  // would attempt to generate more than 26 bytes of output (including the
-  // terminating null).
-  static constexpr int ASCTIME_BUFFER_SIZE = 256;
-  static constexpr int ASCTIME_MAX_BYTES = 26;
-
-  /* 2000-03-01 (mod 400 year, immediately after feb29 */
-  static constexpr int64_t SECONDS_UNTIL2000_MARCH_FIRST =
-      (946684800LL + SECONDS_PER_DAY * (31 + 29));
-  static constexpr int WEEK_DAY_OF2000_MARCH_FIRST = 3;
-
-  static constexpr int DAYS_PER400_YEARS =
-      (DAYS_PER_NON_LEAP_YEAR * 400) + (400 / 4) - 3;
-  static constexpr int DAYS_PER100_YEARS =
-      (DAYS_PER_NON_LEAP_YEAR * 100) + (100 / 4) - 1;
-  static constexpr int DAYS_PER4_YEARS = (DAYS_PER_NON_LEAP_YEAR * 4) + 1;
-
-  // The latest time that can be represented in this form is 03:14:07 UTC on
-  // Tuesday, 19 January 2038 (corresponding to 2,147,483,647 seconds since the
-  // start of the epoch). This means that systems using a 32-bit time_t type are
-  // susceptible to the Year 2038 problem.
-  static constexpr int END_OF32_BIT_EPOCH_YEAR = 2038;
-
-  static constexpr time_t OUT_OF_RANGE_RETURN_VALUE = -1;
-};
-
 // Update the "tm" structure's year, month, etc. members from seconds.
 // "total_seconds" is the number of seconds since January 1st, 1970.
 extern int64_t update_from_seconds(int64_t total_seconds, struct tm *tm);
@@ -98,7 +38,7 @@ LIBC_INLINE time_t out_of_range() {
   // require it.
   libc_errno = EOVERFLOW;
 #endif
-  return TimeConstants::OUT_OF_RANGE_RETURN_VALUE;
+  return time_constants::OUT_OF_RANGE_RETURN_VALUE;
 }
 
 LIBC_INLINE void invalid_value() { libc_errno = EINVAL; }
@@ -110,32 +50,23 @@ LIBC_INLINE char *asctime(const struct tm *timeptr, char *buffer,
     return nullptr;
   }
   if (timeptr->tm_wday < 0 ||
-      timeptr->tm_wday > (TimeConstants::DAYS_PER_WEEK - 1)) {
+      timeptr->tm_wday > (time_constants::DAYS_PER_WEEK - 1)) {
     invalid_value();
     return nullptr;
   }
   if (timeptr->tm_mon < 0 ||
-      timeptr->tm_mon > (TimeConstants::MONTHS_PER_YEAR - 1)) {
+      timeptr->tm_mon > (time_constants::MONTHS_PER_YEAR - 1)) {
     invalid_value();
     return nullptr;
   }
 
-  // TODO(rtenneti): i18n the following strings.
-  static const char *week_days_name[TimeConstants::DAYS_PER_WEEK] = {
-      "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"};
-
-  static const char *months_name[TimeConstants::MONTHS_PER_YEAR] = {
-      "Jan", "Feb", "Mar", "Apr", "May", "Jun",
-      "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"};
-
-  // TODO(michaelr): look into removing this call to __builtin_snprintf that may
-  // be emitted as a call to snprintf. Alternatively, look into using our
-  // internal printf machinery.
+  // TODO(michaelr): move this to use the strftime machinery
   int written_size = __builtin_snprintf(
       buffer, bufferLength, "%.3s %.3s%3d %.2d:%.2d:%.2d %d\n",
-      week_days_name[timeptr->tm_wday], months_name[timeptr->tm_mon],
-      timeptr->tm_mday, timeptr->tm_hour, timeptr->tm_min, timeptr->tm_sec,
-      TimeConstants::TIME_YEAR_BASE + timeptr->tm_year);
+      time_constants::WEEK_DAY_NAMES[timeptr->tm_wday].data(),
+      time_constants::MONTH_NAMES[timeptr->tm_mon].data(), timeptr->tm_mday,
+      timeptr->tm_hour, timeptr->tm_min, timeptr->tm_sec,
+      time_constants::TIME_YEAR_BASE + timeptr->tm_year);
   if (written_size < 0)
     return nullptr;
   if (static_cast<size_t>(written_size) >= bufferLength) {
diff --git a/libc/test/src/time/CMakeLists.txt b/libc/test/src/time/CMakeLists.txt
index da3903f3e0e4..12add224f386 100644
--- a/libc/test/src/time/CMakeLists.txt
+++ b/libc/test/src/time/CMakeLists.txt
@@ -13,6 +13,8 @@ add_libc_unittest(
     20
   DEPENDS
     libc.src.time.asctime
+    libc.hdr.types.struct_tm
+    libc.src.time.time_constants
 )
 
 add_libc_unittest(
@@ -28,6 +30,8 @@ add_libc_unittest(
     20
   DEPENDS
     libc.src.time.asctime_r
+    libc.hdr.types.struct_tm
+    libc.src.time.time_constants
 )
 
 add_libc_unittest(
@@ -45,7 +49,8 @@ add_libc_unittest(
     libc.include.time
     libc.hdr.types.time_t
     libc.src.time.ctime
-    libc.src.time.time_utils
+    libc.src.time.time_constants
+    libc.hdr.types.struct_tm
 )
 
 add_libc_unittest(
@@ -63,7 +68,8 @@ add_libc_unittest(
     libc.include.time
     libc.hdr.types.time_t
     libc.src.time.ctime_r
-    libc.src.time.time_utils
+    libc.src.time.time_constants
+    libc.hdr.types.struct_tm
 )
 
 add_libc_test(
@@ -74,6 +80,9 @@ add_libc_test(
     clock_gettime_test.cpp
   DEPENDS
     libc.src.time.clock_gettime
+    libc.hdr.types.time_t
+    libc.hdr.types.struct_timespec
+    libc.hdr.time_macros
 )
 
 add_libc_test(
@@ -94,6 +103,8 @@ add_libc_unittest(
     difftime_test.cpp
   DEPENDS
     libc.src.time.difftime
+    libc.src.time.time_constants
+    libc.src.__support.FPUtil.fp_bits
 )
 
 add_libc_unittest(
@@ -105,6 +116,7 @@ add_libc_unittest(
   DEPENDS
     libc.include.time
     libc.src.time.gettimeofday
+    libc.hdr.types.struct_timeval
 )
 
 add_libc_unittest(
@@ -118,6 +130,8 @@ add_libc_unittest(
   DEPENDS
     libc.src.time.gmtime
     libc.src.__support.CPP.limits
+    libc.hdr.types.struct_tm
+    libc.src.time.time_constants
 )
 
 add_libc_unittest(
@@ -130,6 +144,8 @@ add_libc_unittest(
     TmMatcher.h
   DEPENDS
     libc.src.time.gmtime_r
+    libc.hdr.types.struct_tm
+    libc.src.time.time_constants
 )
 
 add_libc_unittest(
@@ -146,6 +162,8 @@ add_libc_unittest(
   DEPENDS
     libc.src.time.mktime
     libc.src.__support.CPP.limits
+    libc.hdr.types.struct_tm
+    libc.src.time.time_constants
 )
 
 add_libc_test(
@@ -158,6 +176,7 @@ add_libc_test(
     libc.include.time
     libc.src.time.nanosleep
     libc.src.errno.errno
+    libc.hdr.types.struct_timespec
 )
 
 add_libc_unittest(
@@ -180,6 +199,7 @@ add_libc_test(
     timespec_get_test.cpp
   DEPENDS
     libc.src.time.timespec_get
+    libc.hdr.types.struct_timespec
 )
 
 add_libc_test(
@@ -192,4 +212,5 @@ add_libc_test(
     libc.include.time
     libc.src.time.clock
     libc.src.errno.errno
+    libc.hdr.types.clock_t
 )
diff --git a/libc/test/src/time/TmHelper.h b/libc/test/src/time/TmHelper.h
index 5ae258461099..1582839ffaf2 100644
--- a/libc/test/src/time/TmHelper.h
+++ b/libc/test/src/time/TmHelper.h
@@ -9,12 +9,9 @@
 #ifndef LLVM_LIBC_TEST_SRC_TIME_TMHELPER_H
 #define LLVM_LIBC_TEST_SRC_TIME_TMHELPER_H
 
-#include <time.h>
-
+#include "hdr/types/struct_tm.h"
 #include "src/__support/macros/config.h"
-#include "src/time/time_utils.h"
-
-using LIBC_NAMESPACE::time_utils::TimeConstants;
+#include "src/time/time_constants.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace tmhelper {
@@ -30,7 +27,7 @@ static inline void initialize_tm_data(struct tm *tm_data, int year, int month,
                     .tm_mday = mday,
                     .tm_mon = month - 1, // tm_mon starts with 0 for Jan
                     // years since 1900
-                    .tm_year = year - TimeConstants::TIME_YEAR_BASE,
+                    .tm_year = year - time_constants::TIME_YEAR_BASE,
                     .tm_wday = wday,
                     .tm_yday = yday,
                     .tm_isdst = 0};
diff --git a/libc/test/src/time/TmMatcher.h b/libc/test/src/time/TmMatcher.h
index 630956b0f08d..d39ee396057b 100644
--- a/libc/test/src/time/TmMatcher.h
+++ b/libc/test/src/time/TmMatcher.h
@@ -9,8 +9,7 @@
 #ifndef LLVM_LIBC_TEST_SRC_TIME_TM_MATCHER_H
 #define LLVM_LIBC_TEST_SRC_TIME_TM_MATCHER_H
 
-#include <time.h>
-
+#include "hdr/types/struct_tm.h"
 #include "src/__support/macros/config.h"
 #include "test/UnitTest/Test.h"
 
diff --git a/libc/test/src/time/asctime_r_test.cpp b/libc/test/src/time/asctime_r_test.cpp
index f3aadbb39de4..b595cfe02486 100644
--- a/libc/test/src/time/asctime_r_test.cpp
+++ b/libc/test/src/time/asctime_r_test.cpp
@@ -8,12 +8,10 @@
 
 #include "src/errno/libc_errno.h"
 #include "src/time/asctime_r.h"
-#include "src/time/time_utils.h"
+#include "src/time/time_constants.h"
 #include "test/UnitTest/Test.h"
 #include "test/src/time/TmHelper.h"
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 static inline char *call_asctime_r(struct tm *tm_data, int year, int month,
                                    int mday, int hour, int min, int sec,
                                    int wday, int yday, char *buffer) {
@@ -30,7 +28,7 @@ TEST(LlvmLibcAsctimeR, Nullptr) {
   ASSERT_ERRNO_EQ(EINVAL);
   ASSERT_STREQ(nullptr, result);
 
-  char buffer[TimeConstants::ASCTIME_BUFFER_SIZE];
+  char buffer[LIBC_NAMESPACE::time_constants::ASCTIME_BUFFER_SIZE];
   result = LIBC_NAMESPACE::asctime_r(nullptr, buffer);
   ASSERT_ERRNO_EQ(EINVAL);
   ASSERT_STREQ(nullptr, result);
@@ -42,7 +40,7 @@ TEST(LlvmLibcAsctimeR, Nullptr) {
 }
 
 TEST(LlvmLibcAsctimeR, ValidDate) {
-  char buffer[TimeConstants::ASCTIME_BUFFER_SIZE];
+  char buffer[LIBC_NAMESPACE::time_constants::ASCTIME_BUFFER_SIZE];
   struct tm tm_data;
   char *result;
   // 1970-01-01 00:00:00. Test with a valid buffer size.
diff --git a/libc/test/src/time/clock_gettime_test.cpp b/libc/test/src/time/clock_gettime_test.cpp
index 43715c0265f1..d3edcae00cdd 100644
--- a/libc/test/src/time/clock_gettime_test.cpp
+++ b/libc/test/src/time/clock_gettime_test.cpp
@@ -6,12 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "hdr/time_macros.h"
+#include "hdr/types/struct_timespec.h"
+#include "hdr/types/time_t.h"
 #include "src/__support/macros/properties/architectures.h"
 #include "src/time/clock_gettime.h"
 #include "test/UnitTest/Test.h"
 
-#include <time.h>
-
 TEST(LlvmLibcClockGetTime, RealTime) {
   timespec tp;
   int result;
diff --git a/libc/test/src/time/clock_test.cpp b/libc/test/src/time/clock_test.cpp
index 05082aa23388..8d8d89d577a9 100644
--- a/libc/test/src/time/clock_test.cpp
+++ b/libc/test/src/time/clock_test.cpp
@@ -6,11 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "hdr/types/clock_t.h"
 #include "src/time/clock.h"
 #include "test/UnitTest/Test.h"
 
-#include <time.h>
-
 TEST(LlvmLibcClockTest, SmokeTest) {
   clock_t c1 = LIBC_NAMESPACE::clock();
   ASSERT_GT(c1, clock_t(0));
diff --git a/libc/test/src/time/ctime_r_test.cpp b/libc/test/src/time/ctime_r_test.cpp
index 9ce6f75f7548..27011b7e0fbd 100644
--- a/libc/test/src/time/ctime_r_test.cpp
+++ b/libc/test/src/time/ctime_r_test.cpp
@@ -8,18 +8,16 @@
 
 #include "src/errno/libc_errno.h"
 #include "src/time/ctime_r.h"
-#include "src/time/time_utils.h"
+#include "src/time/time_constants.h"
 #include "test/UnitTest/Test.h"
 #include "test/src/time/TmHelper.h"
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 TEST(LlvmLibcCtimeR, Nullptr) {
   char *result;
   result = LIBC_NAMESPACE::ctime_r(nullptr, nullptr);
   ASSERT_STREQ(nullptr, result);
 
-  char buffer[TimeConstants::ASCTIME_BUFFER_SIZE];
+  char buffer[LIBC_NAMESPACE::time_constants::ASCTIME_BUFFER_SIZE];
   result = LIBC_NAMESPACE::ctime_r(nullptr, buffer);
   ASSERT_STREQ(nullptr, result);
 
@@ -29,7 +27,7 @@ TEST(LlvmLibcCtimeR, Nullptr) {
 }
 
 TEST(LlvmLibcCtimeR, ValidUnixTimestamp0) {
-  char buffer[TimeConstants::ASCTIME_BUFFER_SIZE];
+  char buffer[LIBC_NAMESPACE::time_constants::ASCTIME_BUFFER_SIZE];
   time_t t;
   char *result;
   // 1970-01-01 00:00:00. Test with a valid buffer size.
@@ -39,7 +37,7 @@ TEST(LlvmLibcCtimeR, ValidUnixTimestamp0) {
 }
 
 TEST(LlvmLibcCtime, ValidUnixTimestamp32Int) {
-  char buffer[TimeConstants::ASCTIME_BUFFER_SIZE];
+  char buffer[LIBC_NAMESPACE::time_constants::ASCTIME_BUFFER_SIZE];
   time_t t;
   char *result;
   // 2038-01-19 03:14:07. Test with a valid buffer size.
@@ -49,7 +47,7 @@ TEST(LlvmLibcCtime, ValidUnixTimestamp32Int) {
 }
 
 TEST(LlvmLibcCtimeR, InvalidArgument) {
-  char buffer[TimeConstants::ASCTIME_BUFFER_SIZE];
+  char buffer[LIBC_NAMESPACE::time_constants::ASCTIME_BUFFER_SIZE];
   time_t t;
   char *result;
   t = 2147483648;
diff --git a/libc/test/src/time/difftime_test.cpp b/libc/test/src/time/difftime_test.cpp
index 68ff4630e61b..4dab1ac91104 100644
--- a/libc/test/src/time/difftime_test.cpp
+++ b/libc/test/src/time/difftime_test.cpp
@@ -8,15 +8,12 @@
 
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/time/difftime.h"
-#include "src/time/time_utils.h"
+#include "src/time/time_constants.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 TEST(LlvmLibcDifftime, SmokeTest) {
-  time_t t1_seconds = TimeConstants::SECONDS_PER_HOUR;
+  time_t t1_seconds = LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR;
   time_t t2_seconds = 0;
 
   LIBC_NAMESPACE::fputil::FPBits<long double> expected_fp =
diff --git a/libc/test/src/time/gettimeofday_test.cpp b/libc/test/src/time/gettimeofday_test.cpp
index ee934b7f3a20..8f9f136164f5 100644
--- a/libc/test/src/time/gettimeofday_test.cpp
+++ b/libc/test/src/time/gettimeofday_test.cpp
@@ -6,8 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <time.h>
-
+#include "hdr/types/struct_timeval.h"
 #include "src/time/gettimeofday.h"
 #include "test/UnitTest/Test.h"
 
diff --git a/libc/test/src/time/gmtime_r_test.cpp b/libc/test/src/time/gmtime_r_test.cpp
index 2276b4803f19..9d466f444f97 100644
--- a/libc/test/src/time/gmtime_r_test.cpp
+++ b/libc/test/src/time/gmtime_r_test.cpp
@@ -7,12 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/time/gmtime_r.h"
-#include "src/time/time_utils.h"
+#include "src/time/time_constants.h"
 #include "test/UnitTest/Test.h"
 #include "test/src/time/TmMatcher.h"
 
-using LIBC_NAMESPACE::time_utils::TimeConstants;
-
 // gmtime and gmtime_r share the same code and thus didn't repeat all the tests
 // from gmtime. Added couple of validation tests.
 TEST(LlvmLibcGmTimeR, EndOf32BitEpochYear) {
@@ -22,16 +20,17 @@ TEST(LlvmLibcGmTimeR, EndOf32BitEpochYear) {
   struct tm tm_data;
   struct tm *tm_data_ptr;
   tm_data_ptr = LIBC_NAMESPACE::gmtime_r(&seconds, &tm_data);
-  EXPECT_TM_EQ((tm{7,  // sec
-                   14, // min
-                   3,  // hr
-                   19, // day
-                   0,  // tm_mon starts with 0 for Jan
-                   2038 - TimeConstants::TIME_YEAR_BASE, // year
-                   2,                                    // wday
-                   7,                                    // yday
-                   0}),
-               *tm_data_ptr);
+  EXPECT_TM_EQ(
+      (tm{7,  // sec
+          14, // min
+          3,  // hr
+          19, // day
+          0,  // tm_mon starts with 0 for Jan
+          2038 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          2,                                                     // wday
+          7,                                                     // yday
+          0}),
+      *tm_data_ptr);
   EXPECT_TM_EQ(*tm_data_ptr, tm_data);
 }
 
@@ -43,15 +42,16 @@ TEST(LlvmLibcGmTimeR, Max64BitYear) {
   struct tm tm_data;
   struct tm *tm_data_ptr;
   tm_data_ptr = LIBC_NAMESPACE::gmtime_r(&seconds, &tm_data);
-  EXPECT_TM_EQ((tm{50, // sec
-                   50, // min
-                   12, // hr
-                   1,  // day
-                   0,  // tm_mon starts with 0 for Jan
-                   2147483647 - TimeConstants::TIME_YEAR_BASE, // year
-                   2,                                          // wday
-                   50,                                         // yday
-                   0}),
-               *tm_data_ptr);
+  EXPECT_TM_EQ(
+      (tm{50, // sec
+          50, // min
+          12, // hr
+          1,  // day
+          0,  // tm_mon starts with 0 for Jan
+          2147483647 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          2,                                                           // wday
+          50,                                                          // yday
+          0}),
+      *tm_data_ptr);
   EXPECT_TM_EQ(*tm_data_ptr, tm_data);
 }
diff --git a/libc/test/src/time/gmtime_test.cpp b/libc/test/src/time/gmtime_test.cpp
index 433fbf666705..6af5a18d3699 100644
--- a/libc/test/src/time/gmtime_test.cpp
+++ b/libc/test/src/time/gmtime_test.cpp
@@ -6,32 +6,36 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "hdr/types/struct_tm.h"
 #include "src/__support/CPP/limits.h" // INT_MAX, INT_MIN
 #include "src/errno/libc_errno.h"
 #include "src/time/gmtime.h"
-#include "src/time/time_utils.h"
+#include "src/time/time_constants.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "test/src/time/TmMatcher.h"
 
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
-using LIBC_NAMESPACE::time_utils::TimeConstants;
 
 TEST(LlvmLibcGmTime, OutOfRange) {
   if (sizeof(time_t) < sizeof(int64_t))
     return;
   time_t seconds =
-      1 + INT_MAX * static_cast<int64_t>(
-                        TimeConstants::NUMBER_OF_SECONDS_IN_LEAP_YEAR);
+      1 +
+      INT_MAX *
+          static_cast<int64_t>(
+              LIBC_NAMESPACE::time_constants::NUMBER_OF_SECONDS_IN_LEAP_YEAR);
   struct tm *tm_data = LIBC_NAMESPACE::gmtime(&seconds);
   EXPECT_TRUE(tm_data == nullptr);
   ASSERT_ERRNO_EQ(EOVERFLOW);
 
   LIBC_NAMESPACE::libc_errno = 0;
-  seconds = INT_MIN * static_cast<int64_t>(
-                          TimeConstants::NUMBER_OF_SECONDS_IN_LEAP_YEAR) -
-            1;
+  seconds =
+      INT_MIN *
+          static_cast<int64_t>(
+              LIBC_NAMESPACE::time_constants::NUMBER_OF_SECONDS_IN_LEAP_YEAR) -
+      1;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
   EXPECT_TRUE(tm_data == nullptr);
   ASSERT_ERRNO_EQ(EOVERFLOW);
@@ -43,201 +47,215 @@ TEST(LlvmLibcGmTime, InvalidSeconds) {
   // -1 second from 1970-01-01 00:00:00 returns 1969-12-31 23:59:59.
   seconds = -1;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{59,     // sec
-                   59,     // min
-                   23,     // hr
-                   31,     // day
-                   12 - 1, // tm_mon starts with 0 for Jan
-                   1969 - TimeConstants::TIME_YEAR_BASE, // year
-                   3,                                    // wday
-                   364,                                  // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{59,     // sec
+          59,     // min
+          23,     // hr
+          31,     // day
+          12 - 1, // tm_mon starts with 0 for Jan
+          1969 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          3,                                                     // wday
+          364,                                                   // yday
+          0}),
+      *tm_data);
   // 60 seconds from 1970-01-01 00:00:00 returns 1970-01-01 00:01:00.
   seconds = 60;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0, // sec
-                   1, // min
-                   0, // hr
-                   1, // day
-                   0, // tm_mon starts with 0 for Jan
-                   1970 - TimeConstants::TIME_YEAR_BASE, // year
-                   4,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0, // sec
+          1, // min
+          0, // hr
+          1, // day
+          0, // tm_mon starts with 0 for Jan
+          1970 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          4,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
 }
 
 TEST(LlvmLibcGmTime, InvalidMinutes) {
   time_t seconds = 0;
   struct tm *tm_data = nullptr;
   // -1 minute from 1970-01-01 00:00:00 returns 1969-12-31 23:59:00.
-  seconds = -TimeConstants::SECONDS_PER_MIN;
+  seconds = -LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0,  // sec
-                   59, // min
-                   23, // hr
-                   31, // day
-                   11, // tm_mon starts with 0 for Jan
-                   1969 - TimeConstants::TIME_YEAR_BASE, // year
-                   3,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0,  // sec
+          59, // min
+          23, // hr
+          31, // day
+          11, // tm_mon starts with 0 for Jan
+          1969 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          3,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
   // 60 minutes from 1970-01-01 00:00:00 returns 1970-01-01 01:00:00.
-  seconds = 60 * TimeConstants::SECONDS_PER_MIN;
+  seconds = 60 * LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0, // sec
-                   0, // min
-                   1, // hr
-                   1, // day
-                   0, // tm_mon starts with 0 for Jan
-                   1970 - TimeConstants::TIME_YEAR_BASE, // year
-                   4,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0, // sec
+          0, // min
+          1, // hr
+          1, // day
+          0, // tm_mon starts with 0 for Jan
+          1970 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          4,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
 }
 
 TEST(LlvmLibcGmTime, InvalidHours) {
   time_t seconds = 0;
   struct tm *tm_data = nullptr;
   // -1 hour from 1970-01-01 00:00:00 returns 1969-12-31 23:00:00.
-  seconds = -TimeConstants::SECONDS_PER_HOUR;
+  seconds = -LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0,  // sec
-                   0,  // min
-                   23, // hr
-                   31, // day
-                   11, // tm_mon starts with 0 for Jan
-                   1969 - TimeConstants::TIME_YEAR_BASE, // year
-                   3,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0,  // sec
+          0,  // min
+          23, // hr
+          31, // day
+          11, // tm_mon starts with 0 for Jan
+          1969 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          3,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
   // 24 hours from 1970-01-01 00:00:00 returns 1970-01-02 00:00:00.
-  seconds = 24 * TimeConstants::SECONDS_PER_HOUR;
+  seconds = 24 * LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0, // sec
-                   0, // min
-                   0, // hr
-                   2, // day
-                   0, // tm_mon starts with 0 for Jan
-                   1970 - TimeConstants::TIME_YEAR_BASE, // year
-                   5,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0, // sec
+          0, // min
+          0, // hr
+          2, // day
+          0, // tm_mon starts with 0 for Jan
+          1970 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          5,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
 }
 
 TEST(LlvmLibcGmTime, InvalidYear) {
   // -1 year from 1970-01-01 00:00:00 returns 1969-01-01 00:00:00.
-  time_t seconds =
-      -TimeConstants::DAYS_PER_NON_LEAP_YEAR * TimeConstants::SECONDS_PER_DAY;
+  time_t seconds = -LIBC_NAMESPACE::time_constants::DAYS_PER_NON_LEAP_YEAR *
+                   LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY;
   struct tm *tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0, // sec
-                   0, // min
-                   0, // hr
-                   1, // day
-                   0, // tm_mon starts with 0 for Jan
-                   1969 - TimeConstants::TIME_YEAR_BASE, // year
-                   3,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0, // sec
+          0, // min
+          0, // hr
+          1, // day
+          0, // tm_mon starts with 0 for Jan
+          1969 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          3,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
 }
 
 TEST(LlvmLibcGmTime, InvalidMonths) {
   time_t seconds = 0;
   struct tm *tm_data = nullptr;
   // -1 month from 1970-01-01 00:00:00 returns 1969-12-01 00:00:00.
-  seconds = -31 * TimeConstants::SECONDS_PER_DAY;
+  seconds = -31 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0,      // sec
-                   0,      // min
-                   0,      // hr
-                   1,      // day
-                   12 - 1, // tm_mon starts with 0 for Jan
-                   1969 - TimeConstants::TIME_YEAR_BASE, // year
-                   1,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0,      // sec
+          0,      // min
+          0,      // hr
+          1,      // day
+          12 - 1, // tm_mon starts with 0 for Jan
+          1969 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          1,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
   // 1970-13-01 00:00:00 returns 1971-01-01 00:00:00.
-  seconds =
-      TimeConstants::DAYS_PER_NON_LEAP_YEAR * TimeConstants::SECONDS_PER_DAY;
+  seconds = LIBC_NAMESPACE::time_constants::DAYS_PER_NON_LEAP_YEAR *
+            LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0, // sec
-                   0, // min
-                   0, // hr
-                   1, // day
-                   0, // tm_mon starts with 0 for Jan
-                   1971 - TimeConstants::TIME_YEAR_BASE, // year
-                   5,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0, // sec
+          0, // min
+          0, // hr
+          1, // day
+          0, // tm_mon starts with 0 for Jan
+          1971 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          5,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
 }
 
 TEST(LlvmLibcGmTime, InvalidDays) {
   time_t seconds = 0;
   struct tm *tm_data = nullptr;
   // -1 day from 1970-01-01 00:00:00 returns 1969-12-31 00:00:00.
-  seconds = -1 * TimeConstants::SECONDS_PER_DAY;
+  seconds = -1 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0,  // sec
-                   0,  // min
-                   0,  // hr
-                   31, // day
-                   11, // tm_mon starts with 0 for Jan
-                   1969 - TimeConstants::TIME_YEAR_BASE, // year
-                   3,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0,  // sec
+          0,  // min
+          0,  // hr
+          31, // day
+          11, // tm_mon starts with 0 for Jan
+          1969 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          3,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
 
   // 1970-01-32 00:00:00 returns 1970-02-01 00:00:00.
-  seconds = 31 * TimeConstants::SECONDS_PER_DAY;
+  seconds = 31 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0, // sec
-                   0, // min
-                   0, // hr
-                   1, // day
-                   0, // tm_mon starts with 0 for Jan
-                   1970 - TimeConstants::TIME_YEAR_BASE, // year
-                   0,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0, // sec
+          0, // min
+          0, // hr
+          1, // day
+          0, // tm_mon starts with 0 for Jan
+          1970 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          0,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
 
   // 1970-02-29 00:00:00 returns 1970-03-01 00:00:00.
-  seconds = 59 * TimeConstants::SECONDS_PER_DAY;
+  seconds = 59 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0, // sec
-                   0, // min
-                   0, // hr
-                   1, // day
-                   2, // tm_mon starts with 0 for Jan
-                   1970 - TimeConstants::TIME_YEAR_BASE, // year
-                   0,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0, // sec
+          0, // min
+          0, // hr
+          1, // day
+          2, // tm_mon starts with 0 for Jan
+          1970 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          0,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
 
   // 1972-02-30 00:00:00 returns 1972-03-01 00:00:00.
-  seconds = ((2 * TimeConstants::DAYS_PER_NON_LEAP_YEAR) + 60) *
-            TimeConstants::SECONDS_PER_DAY;
+  seconds =
+      ((2 * LIBC_NAMESPACE::time_constants::DAYS_PER_NON_LEAP_YEAR) + 60) *
+      LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{0, // sec
-                   0, // min
-                   0, // hr
-                   1, // day
-                   2, // tm_mon starts with 0 for Jan
-                   1972 - TimeConstants::TIME_YEAR_BASE, // year
-                   3,                                    // wday
-                   0,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{0, // sec
+          0, // min
+          0, // hr
+          1, // day
+          2, // tm_mon starts with 0 for Jan
+          1972 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          3,                                                     // wday
+          0,                                                     // yday
+          0}),
+      *tm_data);
 }
 
 TEST(LlvmLibcGmTime, EndOf32BitEpochYear) {
@@ -245,16 +263,17 @@ TEST(LlvmLibcGmTime, EndOf32BitEpochYear) {
   // Test implementation can encode time for Tue 19 January 2038 03:14:07 UTC.
   time_t seconds = 0x7FFFFFFF;
   struct tm *tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{7,  // sec
-                   14, // min
-                   3,  // hr
-                   19, // day
-                   0,  // tm_mon starts with 0 for Jan
-                   2038 - TimeConstants::TIME_YEAR_BASE, // year
-                   2,                                    // wday
-                   7,                                    // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{7,  // sec
+          14, // min
+          3,  // hr
+          19, // day
+          0,  // tm_mon starts with 0 for Jan
+          2038 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          2,                                                     // wday
+          7,                                                     // yday
+          0}),
+      *tm_data);
 }
 
 TEST(LlvmLibcGmTime, Max64BitYear) {
@@ -263,28 +282,30 @@ TEST(LlvmLibcGmTime, Max64BitYear) {
   // Mon Jan 1 12:50:50 2170 (200 years from 1970),
   time_t seconds = 6311479850;
   struct tm *tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{50, // sec
-                   50, // min
-                   12, // hr
-                   1,  // day
-                   0,  // tm_mon starts with 0 for Jan
-                   2170 - TimeConstants::TIME_YEAR_BASE, // year
-                   1,                                    // wday
-                   50,                                   // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{50, // sec
+          50, // min
+          12, // hr
+          1,  // day
+          0,  // tm_mon starts with 0 for Jan
+          2170 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          1,                                                     // wday
+          50,                                                    // yday
+          0}),
+      *tm_data);
 
   // Test for Tue Jan 1 12:50:50 in 2,147,483,647th year.
   seconds = 67767976202043050;
   tm_data = LIBC_NAMESPACE::gmtime(&seconds);
-  EXPECT_TM_EQ((tm{50, // sec
-                   50, // min
-                   12, // hr
-                   1,  // day
-                   0,  // tm_mon starts with 0 for Jan
-                   2147483647 - TimeConstants::TIME_YEAR_BASE, // year
-                   2,                                          // wday
-                   50,                                         // yday
-                   0}),
-               *tm_data);
+  EXPECT_TM_EQ(
+      (tm{50, // sec
+          50, // min
+          12, // hr
+          1,  // day
+          0,  // tm_mon starts with 0 for Jan
+          2147483647 - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE, // year
+          2,                                                           // wday
+          50,                                                          // yday
+          0}),
+      *tm_data);
 }
diff --git a/libc/test/src/time/mktime_test.cpp b/libc/test/src/time/mktime_test.cpp
index 84e6c7eb2c42..fe1116f7dd2e 100644
--- a/libc/test/src/time/mktime_test.cpp
+++ b/libc/test/src/time/mktime_test.cpp
@@ -8,7 +8,7 @@
 
 #include "src/__support/CPP/limits.h" // INT_MAX
 #include "src/time/mktime.h"
-#include "src/time/time_utils.h"
+#include "src/time/time_constants.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "test/src/time/TmHelper.h"
@@ -16,29 +16,37 @@
 
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
-using LIBC_NAMESPACE::time_utils::Month;
+using LIBC_NAMESPACE::time_constants::Month;
 
 static inline constexpr int tm_year(int year) {
-  return year - TimeConstants::TIME_YEAR_BASE;
+  return year - LIBC_NAMESPACE::time_constants::TIME_YEAR_BASE;
 }
 
 TEST(LlvmLibcMkTime, FailureSetsErrno) {
-  struct tm tm_data {
-    .tm_sec = INT_MAX, .tm_min = INT_MAX, .tm_hour = INT_MAX,
-    .tm_mday = INT_MAX, .tm_mon = INT_MAX - 1, .tm_year = tm_year(INT_MAX),
-    .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0
-  };
+  struct tm tm_data{.tm_sec = INT_MAX,
+                    .tm_min = INT_MAX,
+                    .tm_hour = INT_MAX,
+                    .tm_mday = INT_MAX,
+                    .tm_mon = INT_MAX - 1,
+                    .tm_year = tm_year(INT_MAX),
+                    .tm_wday = 0,
+                    .tm_yday = 0,
+                    .tm_isdst = 0};
   EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
 }
 
 TEST(LlvmLibcMkTime, InvalidSeconds) {
   {
     // -1 second from 1970-01-01 00:00:00 returns 1969-12-31 23:59:59.
-    struct tm tm_data {
-      .tm_sec = -1, .tm_min = 0, .tm_hour = 0, .tm_mday = 1,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = -1,
+                      .tm_min = 0,
+                      .tm_hour = 0,
+                      .tm_mday = 1,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(-1));
     EXPECT_TM_EQ((tm{.tm_sec = 59,
                      .tm_min = 59,
@@ -54,11 +62,15 @@ TEST(LlvmLibcMkTime, InvalidSeconds) {
 
   {
     // 60 seconds from 1970-01-01 00:00:00 returns 1970-01-01 00:01:00.
-    struct tm tm_data {
-      .tm_sec = 60, .tm_min = 0, .tm_hour = 0, .tm_mday = 1,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 60,
+                      .tm_min = 0,
+                      .tm_hour = 0,
+                      .tm_mday = 1,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(60));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 1,
@@ -76,13 +88,17 @@ TEST(LlvmLibcMkTime, InvalidSeconds) {
 TEST(LlvmLibcMkTime, InvalidMinutes) {
   {
     // -1 minute from 1970-01-01 00:00:00 returns 1969-12-31 23:59:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = -1, .tm_hour = 0, .tm_mday = 1,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = -1,
+                      .tm_hour = 0,
+                      .tm_mday = 1,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(-TimeConstants::SECONDS_PER_MIN));
+                Succeeds(-LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 59,
                      .tm_hour = 23,
@@ -97,13 +113,17 @@ TEST(LlvmLibcMkTime, InvalidMinutes) {
 
   {
     // 60 minutes from 1970-01-01 00:00:00 returns 1970-01-01 01:00:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = 60, .tm_hour = 0, .tm_mday = 1,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = 60,
+                      .tm_hour = 0,
+                      .tm_mday = 1,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(60 * TimeConstants::SECONDS_PER_MIN));
+                Succeeds(60 * LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 0,
                      .tm_hour = 1,
@@ -120,13 +140,17 @@ TEST(LlvmLibcMkTime, InvalidMinutes) {
 TEST(LlvmLibcMkTime, InvalidHours) {
   {
     // -1 hour from 1970-01-01 00:00:00 returns 1969-12-31 23:00:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = 0, .tm_hour = -1, .tm_mday = 1,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = 0,
+                      .tm_hour = -1,
+                      .tm_mday = 1,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(-TimeConstants::SECONDS_PER_HOUR));
+                Succeeds(-LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 0,
                      .tm_hour = 23,
@@ -141,13 +165,18 @@ TEST(LlvmLibcMkTime, InvalidHours) {
 
   {
     // 24 hours from 1970-01-01 00:00:00 returns 1970-01-02 00:00:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = 0, .tm_hour = 24, .tm_mday = 1,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
-    EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(24 * TimeConstants::SECONDS_PER_HOUR));
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = 0,
+                      .tm_hour = 24,
+                      .tm_mday = 1,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
+    EXPECT_THAT(
+        LIBC_NAMESPACE::mktime(&tm_data),
+        Succeeds(24 * LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 0,
                      .tm_hour = 0,
@@ -163,14 +192,18 @@ TEST(LlvmLibcMkTime, InvalidHours) {
 
 TEST(LlvmLibcMkTime, InvalidYear) {
   // -1 year from 1970-01-01 00:00:00 returns 1969-01-01 00:00:00.
-  struct tm tm_data {
-    .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 1,
-    .tm_mon = Month::JANUARY, .tm_year = tm_year(1969), .tm_wday = 0,
-    .tm_yday = 0, .tm_isdst = 0
-  };
+  struct tm tm_data{.tm_sec = 0,
+                    .tm_min = 0,
+                    .tm_hour = 0,
+                    .tm_mday = 1,
+                    .tm_mon = Month::JANUARY,
+                    .tm_year = tm_year(1969),
+                    .tm_wday = 0,
+                    .tm_yday = 0,
+                    .tm_isdst = 0};
   EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-              Succeeds(-TimeConstants::DAYS_PER_NON_LEAP_YEAR *
-                       TimeConstants::SECONDS_PER_DAY));
+              Succeeds(-LIBC_NAMESPACE::time_constants::DAYS_PER_NON_LEAP_YEAR *
+                       LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY));
   EXPECT_TM_EQ((tm{.tm_sec = 0,
                    .tm_min = 0,
                    .tm_hour = 0,
@@ -188,61 +221,85 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) {
     return;
   {
     // 2038-01-19 03:14:08 tests overflow of the second in 2038.
-    struct tm tm_data {
-      .tm_sec = 8, .tm_min = 14, .tm_hour = 3, .tm_mday = 19,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 8,
+                      .tm_min = 14,
+                      .tm_hour = 3,
+                      .tm_mday = 19,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2038),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
   }
 
   {
     // 2038-01-19 03:15:07 tests overflow of the minute in 2038.
-    struct tm tm_data {
-      .tm_sec = 7, .tm_min = 15, .tm_hour = 3, .tm_mday = 19,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 7,
+                      .tm_min = 15,
+                      .tm_hour = 3,
+                      .tm_mday = 19,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2038),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
   }
 
   {
     // 2038-01-19 04:14:07 tests overflow of the hour in 2038.
-    struct tm tm_data {
-      .tm_sec = 7, .tm_min = 14, .tm_hour = 4, .tm_mday = 19,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 7,
+                      .tm_min = 14,
+                      .tm_hour = 4,
+                      .tm_mday = 19,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2038),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
   }
 
   {
     // 2038-01-20 03:14:07 tests overflow of the day in 2038.
-    struct tm tm_data {
-      .tm_sec = 7, .tm_min = 14, .tm_hour = 3, .tm_mday = 20,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 7,
+                      .tm_min = 14,
+                      .tm_hour = 3,
+                      .tm_mday = 20,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2038),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
   }
 
   {
     // 2038-02-19 03:14:07 tests overflow of the month in 2038.
-    struct tm tm_data {
-      .tm_sec = 7, .tm_min = 14, .tm_hour = 3, .tm_mday = 19,
-      .tm_mon = Month::FEBRUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 7,
+                      .tm_min = 14,
+                      .tm_hour = 3,
+                      .tm_mday = 19,
+                      .tm_mon = Month::FEBRUARY,
+                      .tm_year = tm_year(2038),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
   }
 
   {
     // 2039-01-19 03:14:07 tests overflow of the year.
-    struct tm tm_data {
-      .tm_sec = 7, .tm_min = 14, .tm_hour = 3, .tm_mday = 19,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2039), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 7,
+                      .tm_min = 14,
+                      .tm_hour = 3,
+                      .tm_mday = 19,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2039),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Fails(EOVERFLOW));
   }
 }
@@ -250,12 +307,18 @@ TEST(LlvmLibcMkTime, InvalidEndOf32BitEpochYear) {
 TEST(LlvmLibcMkTime, InvalidMonths) {
   {
     // -1 month from 1970-01-01 00:00:00 returns 1969-12-01 00:00:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 0, .tm_mon = -1,
-      .tm_year = tm_year(1970), .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0
-    };
-    EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(-32 * TimeConstants::SECONDS_PER_DAY));
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = 0,
+                      .tm_hour = 0,
+                      .tm_mday = 0,
+                      .tm_mon = -1,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
+    EXPECT_THAT(
+        LIBC_NAMESPACE::mktime(&tm_data),
+        Succeeds(-32 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 0,
                      .tm_hour = 0,
@@ -270,13 +333,19 @@ TEST(LlvmLibcMkTime, InvalidMonths) {
 
   {
     // 1970-13-01 00:00:00 returns 1971-01-01 00:00:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 1, .tm_mon = 12,
-      .tm_year = tm_year(1970), .tm_wday = 0, .tm_yday = 0, .tm_isdst = 0
-    };
-    EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(TimeConstants::DAYS_PER_NON_LEAP_YEAR *
-                         TimeConstants::SECONDS_PER_DAY));
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = 0,
+                      .tm_hour = 0,
+                      .tm_mday = 1,
+                      .tm_mon = 12,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
+    EXPECT_THAT(
+        LIBC_NAMESPACE::mktime(&tm_data),
+        Succeeds(LIBC_NAMESPACE::time_constants::DAYS_PER_NON_LEAP_YEAR *
+                 LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 0,
                      .tm_hour = 0,
@@ -293,13 +362,17 @@ TEST(LlvmLibcMkTime, InvalidMonths) {
 TEST(LlvmLibcMkTime, InvalidDays) {
   {
     // -1 day from 1970-01-01 00:00:00 returns 1969-12-31 00:00:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = (1 - 1),
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = 0,
+                      .tm_hour = 0,
+                      .tm_mday = (1 - 1),
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(-1 * TimeConstants::SECONDS_PER_DAY));
+                Succeeds(-1 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 0,
                      .tm_hour = 0,
@@ -314,13 +387,17 @@ TEST(LlvmLibcMkTime, InvalidDays) {
 
   {
     // 1970-01-32 00:00:00 returns 1970-02-01 00:00:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 32,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = 0,
+                      .tm_hour = 0,
+                      .tm_mday = 32,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(31 * TimeConstants::SECONDS_PER_DAY));
+                Succeeds(31 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 0,
                      .tm_hour = 0,
@@ -335,13 +412,17 @@ TEST(LlvmLibcMkTime, InvalidDays) {
 
   {
     // 1970-02-29 00:00:00 returns 1970-03-01 00:00:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 29,
-      .tm_mon = Month::FEBRUARY, .tm_year = tm_year(1970), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = 0,
+                      .tm_hour = 0,
+                      .tm_mday = 29,
+                      .tm_mon = Month::FEBRUARY,
+                      .tm_year = tm_year(1970),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(59 * TimeConstants::SECONDS_PER_DAY));
+                Succeeds(59 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 0,
                      .tm_hour = 0,
@@ -356,14 +437,20 @@ TEST(LlvmLibcMkTime, InvalidDays) {
 
   {
     // 1972-02-30 00:00:00 returns 1972-03-01 00:00:00.
-    struct tm tm_data {
-      .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 30,
-      .tm_mon = Month::FEBRUARY, .tm_year = tm_year(1972), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
-    EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(((2 * TimeConstants::DAYS_PER_NON_LEAP_YEAR) + 60) *
-                         TimeConstants::SECONDS_PER_DAY));
+    struct tm tm_data{.tm_sec = 0,
+                      .tm_min = 0,
+                      .tm_hour = 0,
+                      .tm_mday = 30,
+                      .tm_mon = Month::FEBRUARY,
+                      .tm_year = tm_year(1972),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
+    EXPECT_THAT(
+        LIBC_NAMESPACE::mktime(&tm_data),
+        Succeeds(((2 * LIBC_NAMESPACE::time_constants::DAYS_PER_NON_LEAP_YEAR) +
+                  60) *
+                 LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY));
     EXPECT_TM_EQ((tm{.tm_sec = 0,
                      .tm_min = 0,
                      .tm_hour = 0,
@@ -381,11 +468,15 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) {
   // Test for maximum value of a signed 32-bit integer.
   // Test implementation can encode time for Tue 19 January 2038 03:14:07 UTC.
   {
-    struct tm tm_data {
-      .tm_sec = 7, .tm_min = 14, .tm_hour = 3, .tm_mday = 19,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 7,
+                      .tm_min = 14,
+                      .tm_hour = 3,
+                      .tm_mday = 19,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2038),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(0x7FFFFFFF));
     EXPECT_TM_EQ((tm{.tm_sec = 7,
                      .tm_min = 14,
@@ -403,11 +494,15 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) {
   {
     // 2038-01-19 03:13:59 tests that even a large seconds field is
     // accepted if the minutes field is smaller.
-    struct tm tm_data {
-      .tm_sec = 59, .tm_min = 13, .tm_hour = 3, .tm_mday = 19,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 59,
+                      .tm_min = 13,
+                      .tm_hour = 3,
+                      .tm_mday = 19,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2038),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(0x7FFFFFFF - 8));
     EXPECT_TM_EQ((tm{.tm_sec = 59,
                      .tm_min = 13,
@@ -424,13 +519,18 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) {
   {
     // 2038-01-19 02:59:59 tests that large seconds and minutes are
     // accepted if the hours field is smaller.
-    struct tm tm_data {
-      .tm_sec = 59, .tm_min = 59, .tm_hour = 2, .tm_mday = 19,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 59,
+                      .tm_min = 59,
+                      .tm_hour = 2,
+                      .tm_mday = 19,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2038),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(0x7FFFFFFF - 8 - 14 * TimeConstants::SECONDS_PER_MIN));
+                Succeeds(0x7FFFFFFF - 8 -
+                         14 * LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN));
     EXPECT_TM_EQ((tm{.tm_sec = 59,
                      .tm_min = 59,
                      .tm_hour = 2,
@@ -446,14 +546,19 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) {
   {
     // 2038-01-18 23:59:59 tests that large seconds, minutes and hours
     // are accepted if the days field is smaller.
-    struct tm tm_data {
-      .tm_sec = 59, .tm_min = 59, .tm_hour = 23, .tm_mday = 18,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2038), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 59,
+                      .tm_min = 59,
+                      .tm_hour = 23,
+                      .tm_mday = 18,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2038),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(0x7FFFFFFF - 8 - 14 * TimeConstants::SECONDS_PER_MIN -
-                         3 * TimeConstants::SECONDS_PER_HOUR));
+                Succeeds(0x7FFFFFFF - 8 -
+                         14 * LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN -
+                         3 * LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR));
     EXPECT_TM_EQ((tm{.tm_sec = 59,
                      .tm_min = 59,
                      .tm_hour = 23,
@@ -469,15 +574,20 @@ TEST(LlvmLibcMkTime, EndOf32BitEpochYear) {
   {
     // 2038-01-18 23:59:59 tests that the final second of 2037 is
     // accepted.
-    struct tm tm_data {
-      .tm_sec = 59, .tm_min = 59, .tm_hour = 23, .tm_mday = 31,
-      .tm_mon = Month::DECEMBER, .tm_year = tm_year(2037), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 59,
+                      .tm_min = 59,
+                      .tm_hour = 23,
+                      .tm_mday = 31,
+                      .tm_mon = Month::DECEMBER,
+                      .tm_year = tm_year(2037),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data),
-                Succeeds(0x7FFFFFFF - 8 - 14 * TimeConstants::SECONDS_PER_MIN -
-                         3 * TimeConstants::SECONDS_PER_HOUR -
-                         18 * TimeConstants::SECONDS_PER_DAY));
+                Succeeds(0x7FFFFFFF - 8 -
+                         14 * LIBC_NAMESPACE::time_constants::SECONDS_PER_MIN -
+                         3 * LIBC_NAMESPACE::time_constants::SECONDS_PER_HOUR -
+                         18 * LIBC_NAMESPACE::time_constants::SECONDS_PER_DAY));
     EXPECT_TM_EQ((tm{.tm_sec = 59,
                      .tm_min = 59,
                      .tm_hour = 23,
@@ -496,11 +606,15 @@ TEST(LlvmLibcMkTime, Max64BitYear) {
     return;
   {
     // Mon Jan 1 12:50:50 2170 (200 years from 1970),
-    struct tm tm_data {
-      .tm_sec = 50, .tm_min = 50, .tm_hour = 12, .tm_mday = 1,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2170), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 50,
+                      .tm_min = 50,
+                      .tm_hour = 12,
+                      .tm_mday = 1,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2170),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(6311479850));
     EXPECT_TM_EQ((tm{.tm_sec = 50,
                      .tm_min = 50,
@@ -516,11 +630,15 @@ TEST(LlvmLibcMkTime, Max64BitYear) {
 
   {
     // Test for Tue Jan 1 12:50:50 in 2,147,483,647th year.
-    struct tm tm_data {
-      .tm_sec = 50, .tm_min = 50, .tm_hour = 12, .tm_mday = 1,
-      .tm_mon = Month::JANUARY, .tm_year = tm_year(2147483647), .tm_wday = 0,
-      .tm_yday = 0, .tm_isdst = 0
-    };
+    struct tm tm_data{.tm_sec = 50,
+                      .tm_min = 50,
+                      .tm_hour = 12,
+                      .tm_mday = 1,
+                      .tm_mon = Month::JANUARY,
+                      .tm_year = tm_year(2147483647),
+                      .tm_wday = 0,
+                      .tm_yday = 0,
+                      .tm_isdst = 0};
     EXPECT_THAT(LIBC_NAMESPACE::mktime(&tm_data), Succeeds(67767976202043050));
     EXPECT_TM_EQ((tm{.tm_sec = 50,
                      .tm_min = 50,
diff --git a/libc/test/src/time/nanosleep_test.cpp b/libc/test/src/time/nanosleep_test.cpp
index 2a6eea4d5e16..d4f98e29bd98 100644
--- a/libc/test/src/time/nanosleep_test.cpp
+++ b/libc/test/src/time/nanosleep_test.cpp
@@ -6,8 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <time.h>
-
+#include "hdr/types/struct_timespec.h"
 #include "src/errno/libc_errno.h"
 #include "src/time/nanosleep.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
diff --git a/libc/utils/hdrgen/README.rst b/libc/utils/hdrgen/README.rst
index d16e6c5ccaec..6db2968d1e61 100644
--- a/libc/utils/hdrgen/README.rst
+++ b/libc/utils/hdrgen/README.rst
@@ -1,5 +1,4 @@
-This directory contains the sources and specifications for the types,
-macros and entrypoint functions.  These definitions are organized in the
-``yaml`` subdirectory and match the organization of the ``*.h.def``
-files. This directory also contains the Python sources for hdrgen, which is
-what generates the headers.
+This directory also contains the Python sources for hdrgen, which is
+what generates the headers public libc headers. The definitions for these
+headers are in the ``include`` directory. The ``.h.def`` files are the bases
+and the ``.yaml`` files are the contents.
diff --git a/libc/utils/hdrgen/header.py b/libc/utils/hdrgen/header.py
index df8ce613bd0f..9339acceaf7a 100644
--- a/libc/utils/hdrgen/header.py
+++ b/libc/utils/hdrgen/header.py
@@ -9,6 +9,7 @@
 
 class HeaderFile:
     def __init__(self, name):
+        self.template_file = None
         self.name = name
         self.macros = []
         self.types = []
@@ -31,7 +32,7 @@ class HeaderFile:
     def add_function(self, function):
         self.functions.append(function)
 
-    def __str__(self):
+    def public_api(self):
         content = [""]
 
         for macro in self.macros:
diff --git a/libc/utils/hdrgen/main.py b/libc/utils/hdrgen/main.py
new file mode 100755
index 000000000000..5dd392ab6662
--- /dev/null
+++ b/libc/utils/hdrgen/main.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+#
+# ===- Generate headers for libc functions  ------------------*- python -*--==#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ==------------------------------------------------------------------------==#
+
+import argparse
+import sys
+from pathlib import Path
+
+from header import HeaderFile
+from yaml_to_classes import load_yaml_file, fill_public_api
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate header files from YAML")
+    parser.add_argument(
+        "yaml_file",
+        help="Path to the YAML file containing header specification",
+        metavar="FILE",
+        type=Path,
+        nargs=1,
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="Path to write generated header file",
+        type=Path,
+        required=True,
+    )
+    parser.add_argument(
+        "--depfile",
+        help="Path to write a depfile",
+        type=Path,
+    )
+    parser.add_argument(
+        "--write-if-changed",
+        help="Write the output file only if its contents have changed",
+        action="store_true",
+        default=False,
+    )
+    parser.add_argument(
+        "-e",
+        "--entry-point",
+        help="Entry point to include; may be given many times",
+        metavar="SYMBOL",
+        action="append",
+    )
+    args = parser.parse_args()
+
+    [yaml_file] = args.yaml_file
+    files_read = {yaml_file}
+
+    def write_depfile():
+        if not args.depfile:
+            return
+        deps = " ".join(str(f) for f in sorted(files_read))
+        args.depfile.parent.mkdir(parents=True, exist_ok=True)
+        with open(args.depfile, "w") as depfile:
+            depfile.write(f"{args.output}: {deps}\n")
+
+    header = load_yaml_file(yaml_file, HeaderFile, args.entry_point)
+
+    if not header.template_file:
+        print(f"{yaml_file}: Missing header_template", sys.stderr)
+        return 2
+
+    # The header_template path is relative to the containing YAML file.
+    template_path = yaml_file.parent / header.template_file
+
+    files_read.add(template_path)
+    with open(template_path) as template:
+        contents = fill_public_api(header.public_api(), template.read())
+
+    write_depfile()
+
+    if (
+        not args.write_if_changed
+        or not args.output.exists()
+        or args.output.read_text() != contents
+    ):
+        args.output.parent.mkdir(parents=True, exist_ok=True)
+        args.output.write_text(contents)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/libc/utils/hdrgen/tests/input/test_small.yaml b/libc/utils/hdrgen/tests/input/test_small.yaml
index 772552faf81d..1d4b2990a300 100644
--- a/libc/utils/hdrgen/tests/input/test_small.yaml
+++ b/libc/utils/hdrgen/tests/input/test_small.yaml
@@ -1,4 +1,5 @@
-header: test_header.h
+header: test_small.h
+header_template: test_small.h.def
 macros:
   - macro_name: MACRO_A
     macro_value: 1
@@ -62,5 +63,3 @@ functions:
       - type: float
     standards:
       - stdc
-
-
diff --git a/libc/utils/hdrgen/tests/test_integration.py b/libc/utils/hdrgen/tests/test_integration.py
index ce80026e7bcc..49cb08cd1b33 100644
--- a/libc/utils/hdrgen/tests/test_integration.py
+++ b/libc/utils/hdrgen/tests/test_integration.py
@@ -1,36 +1,27 @@
+import argparse
 import subprocess
+import sys
 import unittest
 from pathlib import Path
-import os
-import argparse
-import sys
 
 
 class TestHeaderGenIntegration(unittest.TestCase):
     def setUp(self):
-        self.output_dir = Path(
-            args.output_dir if args.output_dir else "libc/utils/hdrgen/tests/output"
-        )
-
-        self.maxDiff = None
-
-        self.source_dir = Path(__file__).resolve().parent.parent.parent.parent.parent
+        self.output_dir = TestHeaderGenIntegration.output_dir
+        self.source_dir = Path(__file__).parent
+        self.main_script = self.source_dir.parent / "main.py"
 
-    def run_script(self, yaml_file, h_def_file, output_dir, entry_points):
-        yaml_file = self.source_dir / yaml_file
-        h_def_file = self.source_dir / h_def_file
+    def run_script(self, yaml_file, output_file, entry_points):
         command = [
             "python3",
-            str(self.source_dir / "libc/utils/hdrgen/yaml_to_classes.py"),
+            str(self.main_script),
             str(yaml_file),
-            "--h_def_file",
-            str(h_def_file),
-            "--output_dir",
-            str(output_dir),
+            "--output",
+            str(output_file),
         ]
 
         for entry_point in entry_points:
-            command.extend(["--e", entry_point])
+            command.extend(["--entry-point", entry_point])
 
         result = subprocess.run(
             command,
@@ -51,26 +42,23 @@ class TestHeaderGenIntegration(unittest.TestCase):
         self.assertEqual(gen_content, exp_content)
 
     def test_generate_header(self):
-        yaml_file = "libc/utils/hdrgen/tests/input/test_small.yaml"
-        h_def_file = "libc/utils/hdrgen/tests/input/test_small.h.def"
-        expected_output_file = (
-            self.source_dir / "libc/utils/hdrgen/tests/expected_output/test_header.h"
-        )
+        yaml_file = self.source_dir / "input/test_small.yaml"
+        expected_output_file = self.source_dir / "expected_output/test_header.h"
         output_file = self.output_dir / "test_small.h"
         entry_points = {"func_b", "func_a", "func_c", "func_d", "func_e"}
 
-        if not self.output_dir.exists():
-            self.output_dir.mkdir(parents=True)
-
-        self.run_script(yaml_file, h_def_file, self.output_dir, entry_points)
+        self.run_script(yaml_file, output_file, entry_points)
 
         self.compare_files(output_file, expected_output_file)
 
 
-if __name__ == "__main__":
+def main():
     parser = argparse.ArgumentParser(description="TestHeaderGenIntegration arguments")
     parser.add_argument(
-        "--output_dir", type=str, help="Output directory for generated headers"
+        "--output_dir",
+        type=Path,
+        help="Output directory for generated headers",
+        required=True,
     )
     args, remaining_argv = parser.parse_known_args()
 
@@ -79,3 +67,7 @@ if __name__ == "__main__":
     sys.argv[1:] = remaining_argv
 
     unittest.main()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/libc/utils/hdrgen/yaml_to_classes.py b/libc/utils/hdrgen/yaml_to_classes.py
index ec2441b78aee..d64feafc260b 100644
--- a/libc/utils/hdrgen/yaml_to_classes.py
+++ b/libc/utils/hdrgen/yaml_to_classes.py
@@ -35,6 +35,7 @@ def yaml_to_classes(yaml_data, header_class, entry_points=None):
     """
     header_name = yaml_data.get("header")
     header = header_class(header_name)
+    header.template_file = yaml_data.get("header_template")
 
     for macro_data in yaml_data.get("macros", []):
         header.add_macro(Macro(macro_data["macro_name"], macro_data["macro_value"]))
@@ -227,10 +228,6 @@ def main():
         help="Directory to output the generated header file",
     )
     parser.add_argument(
-        "--h_def_file",
-        help="Path to the .h.def template file (required if not using --export_decls)",
-    )
-    parser.add_argument(
         "--add_function",
         nargs=6,
         metavar=(
@@ -244,7 +241,10 @@ def main():
         help="Add a function to the YAML file",
     )
     parser.add_argument(
-        "--e", action="append", help="Entry point to include", dest="entry_points"
+        "--entry-point",
+        action="append",
+        help="Entry point to include",
+        dest="entry_points",
     )
     parser.add_argument(
         "--export-decls",
@@ -268,13 +268,7 @@ def main():
     else:
         output_file_path = Path(f"{Path(args.yaml_file).stem}.h")
 
-    if not args.export_decls and args.h_def_file:
-        with open(args.h_def_file, "r") as f:
-            h_def_content = f.read()
-        final_header_content = fill_public_api(header_str, h_def_content)
-        with open(output_file_path, "w") as f:
-            f.write(final_header_content)
-    else:
+    if args.export_decls:
         with open(output_file_path, "w") as f:
             f.write(header_str)
 
diff --git a/libcxx/docs/Hardening.rst b/libcxx/docs/Hardening.rst
index 531065afb8e8..d399b94d27c1 100644
--- a/libcxx/docs/Hardening.rst
+++ b/libcxx/docs/Hardening.rst
@@ -311,7 +311,10 @@ ABI configuration.
 ABI options
 -----------
 
-Vendors can use the following ABI options to enable additional hardening checks:
+Vendors can use some ABI options at CMake configuration time (when building libc++
+itself) to enable additional hardening checks. This is done by passing these
+macros as ``-DLIBCXX_ABI_DEFINES="_LIBCPP_ABI_FOO;_LIBCPP_ABI_BAR;etc"`` at
+CMake configuration time. The available options are:
 
 - ``_LIBCPP_ABI_BOUNDED_ITERATORS`` -- changes the iterator type of select
   containers (see below) to a bounded iterator that keeps track of whether it's
@@ -341,7 +344,7 @@ Vendors can use the following ABI options to enable additional hardening checks:
 
   ABI impact: changes the iterator type of ``vector`` (except ``vector<bool>``).
 
-- ``_LIBCPP_ABI_BOUNDED_UNIQUE_PTR``` -- tracks the bounds of the array stored inside
+- ``_LIBCPP_ABI_BOUNDED_UNIQUE_PTR`` -- tracks the bounds of the array stored inside
   a ``std::unique_ptr<T[]>``, allowing it to trap when accessed out-of-bounds. This
   requires the ``std::unique_ptr`` to be created using an API like ``std::make_unique``
   or ``std::make_unique_for_overwrite``, otherwise the bounds information is not available
diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst
index c8a07fb8b733..ecfbaa5b7a37 100644
--- a/libcxx/docs/ReleaseNotes/20.rst
+++ b/libcxx/docs/ReleaseNotes/20.rst
@@ -73,6 +73,39 @@ Improvements and New Features
   optimized, resulting in a performance improvement of up to 2x for trivial element types (e.g., `std::vector<int>`),
   and up to 3.4x for non-trivial element types (e.g., `std::vector<std::vector<int>>`).
 
+- On Windows, ``<system_error>``'s ``std::system_category`` is now distinct from ``std::generic_category``. The behavior
+  on other operating systems is unchanged.
+
+  On Windows -- unlike on Unix systems -- the libc and system APIs use distinct error codes. The libc functions return
+  ``errno.h`` error codes via the ``errno`` global, while Win32 API functions return ``winerror.h`` error codes via
+  ``GetLastError()``.
+
+  The C++ standard's ``std::error_code`` and ``std::error_category`` functionality was designed to support multiple
+  error domains, precisely in order to handle situations such as this. However, libc++ formerly treated
+  ``generic_category()`` and ``system_category()`` as equivalent, even on Windows. It now implements the intended split,
+  where ``system_category`` represents native ``winerror.h`` error codes, and ``generic_category`` represents libc error
+  codes (and, equivalently, ``std::errc::*`` errors).
+
+  This change enables code like ``std::error_code(GetLastError(), std::system_category()) ==
+  std::errc::invalid_argument`` to function as desired: constructing an ``error_code`` with the Windows error number in
+  the "system" category, and then mapping it to a generic code with ``error_condition``, for comparison with the
+  ``std::errc`` constant.
+
+  This is an incompatible change: ``std::error_code(ENOSYS, std::system_category()) ==
+  std::errc::function_not_supported`` would formerly have returned true, but now returns false on Windows. Code
+  providing a number from the ``errno.h`` domain should be migrated to construct a ``generic_category`` error_code,
+  instead. (E.g., use ``std::error_code(ENOSYS, std::generic_category())``). The new behavior matches MSVC.
+
+- On Windows, the ``std::filesystem`` library now returns the Win32 ``system_category`` error codes, where it's feasible
+  to do so. This allows interrogation and reporting of the original error code, which is useful if multiple Windows
+  errors map to a single generic error (such as with ``std::errc::no_such_file_or_directory``).
+
+  This is also a slightly-incompatible API change: code inspecting the raw integer value from the returned error_code
+  expecting an integer from ``generic_category`` (e.g. ``err.value() == ENOTDIR``) will not work as desired. Instead,
+  such code should use the comparison operators which implicitly handle eror mappings, ``err ==
+  std::errc::not_a_directory``, or use ``err.default_error_condition()`` to map to an ``error_condition``, and then test
+  its ``value()`` and ``category()``.
+
 Deprecations and Removals
 -------------------------
 
diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst
index cf092fabd046..e98b96bfb478 100644
--- a/libcxx/docs/TestingLibcxx.rst
+++ b/libcxx/docs/TestingLibcxx.rst
@@ -459,6 +459,29 @@ we only want to make sure they don't rot. Do not rely on the results of benchmar
 run through ``check-cxx`` for anything, instead run the benchmarks manually using
 the instructions for running individual tests.
 
+If you want to compare the results of different benchmark runs, we recommend using the
+``libcxx-compare-benchmarks`` helper tool. First, configure CMake in a build directory
+and run the benchmark:
+
+.. code-block:: bash
+
+  $ cmake -S runtimes -B <build1> [...]
+  $ libcxx/utils/libcxx-lit <build1> libcxx/test/benchmarks/string.bench.cpp --param optimization=speed
+
+Then, do the same for the second configuration you want to test. Use a different build
+directory for that configuration:
+
+.. code-block:: bash
+
+  $ cmake -S runtimes -B <build2> [...]
+  $ libcxx/utils/libcxx-lit <build2> libcxx/test/benchmarks/string.bench.cpp --param optimization=speed
+
+Finally, use ``libcxx-compare-benchmarks`` to compare both:
+
+.. code-block:: bash
+
+  $ libcxx/utils/libcxx-compare-benchmarks <build1> <build2> libcxx/test/benchmarks/string.bench.cpp
+
 .. _`Google Benchmark`: https://github.com/google/benchmark
 
 .. _testing-hardening-assertions:
diff --git a/libcxx/include/__algorithm/comp_ref_type.h b/libcxx/include/__algorithm/comp_ref_type.h
index c367fbb91ac2..6a9d5cef2671 100644
--- a/libcxx/include/__algorithm/comp_ref_type.h
+++ b/libcxx/include/__algorithm/comp_ref_type.h
@@ -56,10 +56,10 @@ struct __debug_less {
 // Pass the comparator by lvalue reference. Or in the debug mode, using a debugging wrapper that stores a reference.
 #if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
 template <class _Comp>
-using __comp_ref_type = __debug_less<_Comp>;
+using __comp_ref_type _LIBCPP_NODEBUG = __debug_less<_Comp>;
 #else
 template <class _Comp>
-using __comp_ref_type = _Comp&;
+using __comp_ref_type _LIBCPP_NODEBUG = _Comp&;
 #endif
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/copy.h b/libcxx/include/__algorithm/copy.h
index 4f30b2050abb..962aa90059d5 100644
--- a/libcxx/include/__algorithm/copy.h
+++ b/libcxx/include/__algorithm/copy.h
@@ -47,7 +47,7 @@ struct __copy_impl {
 
   template <class _InIter, class _OutIter>
   struct _CopySegment {
-    using _Traits = __segmented_iterator_traits<_InIter>;
+    using _Traits _LIBCPP_NODEBUG = __segmented_iterator_traits<_InIter>;
 
     _OutIter& __result_;
 
diff --git a/libcxx/include/__algorithm/iterator_operations.h b/libcxx/include/__algorithm/iterator_operations.h
index 6cdb0aec9b2d..e5c89c1e67e3 100644
--- a/libcxx/include/__algorithm/iterator_operations.h
+++ b/libcxx/include/__algorithm/iterator_operations.h
@@ -48,13 +48,13 @@ struct _RangeAlgPolicy {};
 template <>
 struct _IterOps<_RangeAlgPolicy> {
   template <class _Iter>
-  using __value_type = iter_value_t<_Iter>;
+  using __value_type _LIBCPP_NODEBUG = iter_value_t<_Iter>;
 
   template <class _Iter>
-  using __iterator_category = ranges::__iterator_concept<_Iter>;
+  using __iterator_category _LIBCPP_NODEBUG = ranges::__iterator_concept<_Iter>;
 
   template <class _Iter>
-  using __difference_type = iter_difference_t<_Iter>;
+  using __difference_type _LIBCPP_NODEBUG = iter_difference_t<_Iter>;
 
   static constexpr auto advance      = ranges::advance;
   static constexpr auto distance     = ranges::distance;
@@ -72,13 +72,13 @@ struct _ClassicAlgPolicy {};
 template <>
 struct _IterOps<_ClassicAlgPolicy> {
   template <class _Iter>
-  using __value_type = typename iterator_traits<_Iter>::value_type;
+  using __value_type _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::value_type;
 
   template <class _Iter>
-  using __iterator_category = typename iterator_traits<_Iter>::iterator_category;
+  using __iterator_category _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::iterator_category;
 
   template <class _Iter>
-  using __difference_type = typename iterator_traits<_Iter>::difference_type;
+  using __difference_type _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::difference_type;
 
   // advance
   template <class _Iter, class _Distance>
@@ -94,10 +94,10 @@ struct _IterOps<_ClassicAlgPolicy> {
   }
 
   template <class _Iter>
-  using __deref_t = decltype(*std::declval<_Iter&>());
+  using __deref_t _LIBCPP_NODEBUG = decltype(*std::declval<_Iter&>());
 
   template <class _Iter>
-  using __move_t = decltype(std::move(*std::declval<_Iter&>()));
+  using __move_t _LIBCPP_NODEBUG = decltype(std::move(*std::declval<_Iter&>()));
 
   template <class _Iter>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static void __validate_iter_reference() {
@@ -217,7 +217,7 @@ private:
 };
 
 template <class _AlgPolicy, class _Iter>
-using __policy_iter_diff_t = typename _IterOps<_AlgPolicy>::template __difference_type<_Iter>;
+using __policy_iter_diff_t _LIBCPP_NODEBUG = typename _IterOps<_AlgPolicy>::template __difference_type<_Iter>;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h
index f5855379f687..a6836792c058 100644
--- a/libcxx/include/__algorithm/mismatch.h
+++ b/libcxx/include/__algorithm/mismatch.h
@@ -78,7 +78,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
       }
 
       for (size_t __i = 0; __i != __unroll_count; ++__i) {
-        if (auto __cmp_res = std::__as_mask(__lhs[__i] == __rhs[__i]); !std::__all_of(__cmp_res)) {
+        if (auto __cmp_res = __lhs[__i] == __rhs[__i]; !std::__all_of(__cmp_res)) {
           auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res);
           return {__first1 + __offset, __first2 + __offset};
         }
@@ -90,7 +90,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
 
     // check the remaining 0-3 vectors
     while (static_cast<size_t>(__last1 - __first1) >= __vec_size) {
-      if (auto __cmp_res = std::__as_mask(std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2));
+      if (auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
           !std::__all_of(__cmp_res)) {
         auto __offset = std::__find_first_not_set(__cmp_res);
         return {__first1 + __offset, __first2 + __offset};
@@ -107,8 +107,8 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
     if (static_cast<size_t>(__first1 - __orig_first1) >= __vec_size) {
       __first1 = __last1 - __vec_size;
       __first2 = __last2 - __vec_size;
-      auto __offset = std::__find_first_not_set(
-          std::__as_mask(std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2)));
+      auto __offset =
+          std::__find_first_not_set(std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2));
       return {__first1 + __offset, __first2 + __offset};
     } // else loop over the elements individually
   }
diff --git a/libcxx/include/__algorithm/move.h b/libcxx/include/__algorithm/move.h
index 005099dcac06..6f3b0eb5d292 100644
--- a/libcxx/include/__algorithm/move.h
+++ b/libcxx/include/__algorithm/move.h
@@ -50,7 +50,7 @@ struct __move_impl {
 
   template <class _InIter, class _OutIter>
   struct _MoveSegment {
-    using _Traits = __segmented_iterator_traits<_InIter>;
+    using _Traits _LIBCPP_NODEBUG = __segmented_iterator_traits<_InIter>;
 
     _OutIter& __result_;
 
diff --git a/libcxx/include/__algorithm/ranges_iterator_concept.h b/libcxx/include/__algorithm/ranges_iterator_concept.h
index 2af891d3af00..58790e95aa80 100644
--- a/libcxx/include/__algorithm/ranges_iterator_concept.h
+++ b/libcxx/include/__algorithm/ranges_iterator_concept.h
@@ -44,7 +44,7 @@ consteval auto __get_iterator_concept() {
 }
 
 template <class _Iter>
-using __iterator_concept = decltype(__get_iterator_concept<_Iter>());
+using __iterator_concept _LIBCPP_NODEBUG = decltype(__get_iterator_concept<_Iter>());
 
 } // namespace ranges
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/ranges_unique_copy.h b/libcxx/include/__algorithm/ranges_unique_copy.h
index 3b4a64e94ca1..ee7f0a0187b7 100644
--- a/libcxx/include/__algorithm/ranges_unique_copy.h
+++ b/libcxx/include/__algorithm/ranges_unique_copy.h
@@ -60,7 +60,7 @@ struct __unique_copy {
   }
 
   template <class _InIter, class _OutIter>
-  using __algo_tag_t = decltype(__get_algo_tag<_InIter, _OutIter>());
+  using __algo_tag_t _LIBCPP_NODEBUG = decltype(__get_algo_tag<_InIter, _OutIter>());
 
   template <input_iterator _InIter,
             sentinel_for<_InIter> _Sent,
diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h
index 3ca79247bbd0..4e03723a3285 100644
--- a/libcxx/include/__algorithm/simd_utils.h
+++ b/libcxx/include/__algorithm/simd_utils.h
@@ -70,7 +70,7 @@ struct __get_as_integer_type_impl<8> {
 };
 
 template <class _Tp>
-using __get_as_integer_type_t = typename __get_as_integer_type_impl<sizeof(_Tp)>::type;
+using __get_as_integer_type_t _LIBCPP_NODEBUG = typename __get_as_integer_type_impl<sizeof(_Tp)>::type;
 
 // This isn't specialized for 64 byte vectors on purpose. They have the potential to significantly reduce performance
 // in mixed simd/non-simd workloads and don't provide any performance improvement for currently vectorized algorithms
@@ -90,7 +90,7 @@ inline constexpr size_t __native_vector_size = 1;
 #  endif
 
 template <class _ArithmeticT, size_t _Np>
-using __simd_vector __attribute__((__ext_vector_type__(_Np))) = _ArithmeticT;
+using __simd_vector __attribute__((__ext_vector_type__(_Np))) _LIBCPP_NODEBUG = _ArithmeticT;
 
 template <class _VecT>
 inline constexpr size_t __simd_vector_size_v = []<bool _False = false>() -> size_t {
@@ -106,7 +106,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __simd_vector_underlying_type_impl(__simd_vector<_Tp,
 }
 
 template <class _VecT>
-using __simd_vector_underlying_type_t = decltype(std::__simd_vector_underlying_type_impl(_VecT{}));
+using __simd_vector_underlying_type_t _LIBCPP_NODEBUG = decltype(std::__simd_vector_underlying_type_impl(_VecT{}));
 
 // This isn't inlined without always_inline when loading chars.
 template <class _VecT, class _Iter>
@@ -116,65 +116,42 @@ template <class _VecT, class _Iter>
   }(make_index_sequence<__simd_vector_size_v<_VecT>>{});
 }
 
-template <size_t _Np>
-[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool __all_of(__simd_vector<bool, _Np> __vec) noexcept {
-  return __builtin_reduce_and(__vec);
-}
-
 template <class _Tp, size_t _Np>
-[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI auto __as_mask(__simd_vector<_Tp, _Np> __vec) noexcept {
-  static_assert(!is_same<_Tp, bool>::value, "vector type should not be a bool!");
-  return __builtin_convertvector(__vec, __simd_vector<bool, _Np>);
-}
-
-// This uses __builtin_convertvector around the __builtin_shufflevector to work around #107981.
-template <size_t _Np>
-[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI __simd_vector<bool, 8>
-__extend_vector(__simd_vector<bool, _Np> __vec) noexcept {
-  using _VecT = __simd_vector<bool, _Np>;
-  if constexpr (_Np == 4) {
-    return __builtin_convertvector(
-        __builtin_shufflevector(__vec, _VecT{}, 0, 1, 2, 3, 4, 5, 6, 7), __simd_vector<bool, 8>);
-  } else if constexpr (_Np == 2) {
-    return std::__extend_vector(
-        __builtin_convertvector(__builtin_shufflevector(__vec, _VecT{}, 0, 1, 2, 3), __simd_vector<bool, 4>));
-  } else if constexpr (_Np == 1) {
-    return std::__extend_vector(
-        __builtin_convertvector(__builtin_shufflevector(__vec, _VecT{}, 0, 1), __simd_vector<bool, 2>));
-  } else {
-    static_assert(sizeof(_VecT) == 0, "Unexpected vector size");
-  }
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool __all_of(__simd_vector<_Tp, _Np> __vec) noexcept {
+  return __builtin_reduce_and(__builtin_convertvector(__vec, __simd_vector<bool, _Np>));
 }
 
-template <size_t _Np>
-[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI auto __to_int_mask(__simd_vector<bool, _Np> __vec) {
-  if constexpr (_Np < 8) {
-    return std::__bit_cast<uint8_t>(std::__extend_vector(__vec));
-  } else if constexpr (_Np == 8) {
-    return std::__bit_cast<uint8_t>(__vec);
-  } else if constexpr (_Np == 16) {
-    return std::__bit_cast<uint16_t>(__vec);
-  } else if constexpr (_Np == 32) {
-    return std::__bit_cast<uint32_t>(__vec);
-  } else if constexpr (_Np == 64) {
-    return std::__bit_cast<uint64_t>(__vec);
-  } else {
-    static_assert(sizeof(__simd_vector<bool, _Np>) == 0, "Unexpected vector size");
-    return 0;
-  }
-}
+template <class _Tp, size_t _Np>
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector<_Tp, _Np> __vec) noexcept {
+  using __mask_vec = __simd_vector<bool, _Np>;
 
-template <size_t _Np>
-[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector<bool, _Np> __vec) noexcept {
+  // This has MSan disabled du to https://github.com/llvm/llvm-project/issues/85876
+  auto __impl = [&]<class _MaskT>(_MaskT) _LIBCPP_NO_SANITIZE("memory") noexcept {
 #  if defined(_LIBCPP_BIG_ENDIAN)
-  return std::min<size_t>(_Np, std::__countl_zero(std::__to_int_mask(__vec)));
+    return std::min<size_t>(
+        _Np, std::__countl_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec))));
 #  else
-  return std::min<size_t>(_Np, std::__countr_zero(std::__to_int_mask(__vec)));
+    return std::min<size_t>(
+        _Np, std::__countr_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec))));
 #  endif
+  };
+
+  if constexpr (sizeof(__mask_vec) == sizeof(uint8_t)) {
+    return __impl(uint8_t{});
+  } else if constexpr (sizeof(__mask_vec) == sizeof(uint16_t)) {
+    return __impl(uint16_t{});
+  } else if constexpr (sizeof(__mask_vec) == sizeof(uint32_t)) {
+    return __impl(uint32_t{});
+  } else if constexpr (sizeof(__mask_vec) == sizeof(uint64_t)) {
+    return __impl(uint64_t{});
+  } else {
+    static_assert(sizeof(__mask_vec) == 0, "unexpected required size for mask integer type");
+    return 0;
+  }
 }
 
-template <size_t _Np>
-[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_t __find_first_not_set(__simd_vector<bool, _Np> __vec) noexcept {
+template <class _Tp, size_t _Np>
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_t __find_first_not_set(__simd_vector<_Tp, _Np> __vec) noexcept {
   return std::__find_first_set(~__vec);
 }
 
diff --git a/libcxx/include/__algorithm/sort.h b/libcxx/include/__algorithm/sort.h
index ed828b6d7231..5c60b23931cc 100644
--- a/libcxx/include/__algorithm/sort.h
+++ b/libcxx/include/__algorithm/sort.h
@@ -890,10 +890,10 @@ __sort_dispatch(_RandomAccessIterator __first, _RandomAccessIterator __last, _Co
 }
 
 template <class _Type, class... _Options>
-using __is_any_of = _Or<is_same<_Type, _Options>...>;
+using __is_any_of _LIBCPP_NODEBUG = _Or<is_same<_Type, _Options>...>;
 
 template <class _Type>
-using __sort_is_specialized_in_library = __is_any_of<
+using __sort_is_specialized_in_library _LIBCPP_NODEBUG = __is_any_of<
     _Type,
     char,
 #if _LIBCPP_HAS_WIDE_CHARACTERS
diff --git a/libcxx/include/__algorithm/three_way_comp_ref_type.h b/libcxx/include/__algorithm/three_way_comp_ref_type.h
index 5702a1fee082..f6f76455e466 100644
--- a/libcxx/include/__algorithm/three_way_comp_ref_type.h
+++ b/libcxx/include/__algorithm/three_way_comp_ref_type.h
@@ -61,10 +61,10 @@ struct __debug_three_way_comp {
 // Pass the comparator by lvalue reference. Or in the debug mode, using a debugging wrapper that stores a reference.
 #  if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
 template <class _Comp>
-using __three_way_comp_ref_type = __debug_three_way_comp<_Comp>;
+using __three_way_comp_ref_type _LIBCPP_NODEBUG = __debug_three_way_comp<_Comp>;
 #  else
 template <class _Comp>
-using __three_way_comp_ref_type = _Comp&;
+using __three_way_comp_ref_type _LIBCPP_NODEBUG = _Comp&;
 #  endif
 
 #endif // _LIBCPP_STD_VER >= 20
diff --git a/libcxx/include/__algorithm/unwrap_iter.h b/libcxx/include/__algorithm/unwrap_iter.h
index 8cc0d22d4fc2..b66a682e765f 100644
--- a/libcxx/include/__algorithm/unwrap_iter.h
+++ b/libcxx/include/__algorithm/unwrap_iter.h
@@ -46,7 +46,7 @@ struct __unwrap_iter_impl {
 // It's a contiguous iterator, so we can use a raw pointer instead
 template <class _Iter>
 struct __unwrap_iter_impl<_Iter, true> {
-  using _ToAddressT = decltype(std::__to_address(std::declval<_Iter>()));
+  using _ToAddressT _LIBCPP_NODEBUG = decltype(std::__to_address(std::declval<_Iter>()));
 
   static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _Iter __rewrap(_Iter __orig_iter, _ToAddressT __unwrapped_iter) {
     return __orig_iter + (__unwrapped_iter - std::__to_address(__orig_iter));
diff --git a/libcxx/include/__atomic/aliases.h b/libcxx/include/__atomic/aliases.h
index 37d11dd0aabf..4fccebab2563 100644
--- a/libcxx/include/__atomic/aliases.h
+++ b/libcxx/include/__atomic/aliases.h
@@ -84,19 +84,19 @@ using atomic_uintmax_t = atomic<uintmax_t>;
 // C++20 atomic_{signed,unsigned}_lock_free: prefer the contention type most highly, then the largest lock-free type
 #if _LIBCPP_STD_VER >= 20
 #  if ATOMIC_LLONG_LOCK_FREE == 2
-using __largest_lock_free_type = long long;
+using __largest_lock_free_type _LIBCPP_NODEBUG = long long;
 #  elif ATOMIC_INT_LOCK_FREE == 2
-using __largest_lock_free_type = int;
+using __largest_lock_free_type _LIBCPP_NODEBUG = int;
 #  elif ATOMIC_SHORT_LOCK_FREE == 2
-using __largest_lock_free_type = short;
+using __largest_lock_free_type _LIBCPP_NODEBUG = short;
 #  elif ATOMIC_CHAR_LOCK_FREE == 2
-using __largest_lock_free_type = char;
+using __largest_lock_free_type _LIBCPP_NODEBUG = char;
 #  else
 #    define _LIBCPP_NO_LOCK_FREE_TYPES // There are no lockfree types (this can happen on unusual platforms)
 #  endif
 
 #  ifndef _LIBCPP_NO_LOCK_FREE_TYPES
-using __contention_t_or_largest =
+using __contention_t_or_largest _LIBCPP_NODEBUG =
     __conditional_t<__libcpp_is_always_lock_free<__cxx_contention_t>::__value,
                     __cxx_contention_t,
                     __largest_lock_free_type>;
diff --git a/libcxx/include/__atomic/atomic.h b/libcxx/include/__atomic/atomic.h
index 8029b52770d2..975a479e2040 100644
--- a/libcxx/include/__atomic/atomic.h
+++ b/libcxx/include/__atomic/atomic.h
@@ -143,7 +143,7 @@ struct __atomic_base // false
 
 template <class _Tp>
 struct __atomic_base<_Tp, true> : public __atomic_base<_Tp, false> {
-  using __base = __atomic_base<_Tp, false>;
+  using __base _LIBCPP_NODEBUG = __atomic_base<_Tp, false>;
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __atomic_base() _NOEXCEPT = default;
 
@@ -228,9 +228,9 @@ struct __atomic_waitable_traits<__atomic_base<_Tp, _IsIntegral> > {
 
 template <class _Tp>
 struct atomic : public __atomic_base<_Tp> {
-  using __base          = __atomic_base<_Tp>;
-  using value_type      = _Tp;
-  using difference_type = value_type;
+  using __base _LIBCPP_NODEBUG = __atomic_base<_Tp>;
+  using value_type             = _Tp;
+  using difference_type        = value_type;
 
 #if _LIBCPP_STD_VER >= 20
   _LIBCPP_HIDE_FROM_ABI atomic() = default;
@@ -257,9 +257,9 @@ struct atomic : public __atomic_base<_Tp> {
 
 template <class _Tp>
 struct atomic<_Tp*> : public __atomic_base<_Tp*> {
-  using __base          = __atomic_base<_Tp*>;
-  using value_type      = _Tp*;
-  using difference_type = ptrdiff_t;
+  using __base _LIBCPP_NODEBUG = __atomic_base<_Tp*>;
+  using value_type             = _Tp*;
+  using difference_type        = ptrdiff_t;
 
   _LIBCPP_HIDE_FROM_ABI atomic() _NOEXCEPT = default;
 
@@ -389,9 +389,9 @@ private:
   }
 
 public:
-  using __base          = __atomic_base<_Tp>;
-  using value_type      = _Tp;
-  using difference_type = value_type;
+  using __base _LIBCPP_NODEBUG = __atomic_base<_Tp>;
+  using value_type             = _Tp;
+  using difference_type        = value_type;
 
   _LIBCPP_HIDE_FROM_ABI constexpr atomic() noexcept = default;
   _LIBCPP_HIDE_FROM_ABI constexpr atomic(_Tp __d) noexcept : __base(__d) {}
diff --git a/libcxx/include/__atomic/atomic_ref.h b/libcxx/include/__atomic/atomic_ref.h
index eef15983b983..177ea646b6cd 100644
--- a/libcxx/include/__atomic/atomic_ref.h
+++ b/libcxx/include/__atomic/atomic_ref.h
@@ -221,7 +221,7 @@ public:
   _LIBCPP_HIDE_FROM_ABI void notify_all() const noexcept { std::__atomic_notify_all(*this); }
 
 protected:
-  using _Aligned_Tp [[__gnu__::__aligned__(required_alignment)]] = _Tp;
+  using _Aligned_Tp [[__gnu__::__aligned__(required_alignment), __gnu__::__nodebug__]] = _Tp;
   _Aligned_Tp* __ptr_;
 
   _LIBCPP_HIDE_FROM_ABI __atomic_ref_base(_Tp& __obj) : __ptr_(std::addressof(__obj)) {}
@@ -241,7 +241,7 @@ template <class _Tp>
 struct atomic_ref : public __atomic_ref_base<_Tp> {
   static_assert(is_trivially_copyable_v<_Tp>, "std::atomic_ref<T> requires that 'T' be a trivially copyable type");
 
-  using __base = __atomic_ref_base<_Tp>;
+  using __base _LIBCPP_NODEBUG = __atomic_ref_base<_Tp>;
 
   _LIBCPP_HIDE_FROM_ABI explicit atomic_ref(_Tp& __obj) : __base(__obj) {
     _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
@@ -259,7 +259,7 @@ struct atomic_ref : public __atomic_ref_base<_Tp> {
 template <class _Tp>
   requires(std::integral<_Tp> && !std::same_as<bool, _Tp>)
 struct atomic_ref<_Tp> : public __atomic_ref_base<_Tp> {
-  using __base = __atomic_ref_base<_Tp>;
+  using __base _LIBCPP_NODEBUG = __atomic_ref_base<_Tp>;
 
   using difference_type = __base::value_type;
 
@@ -305,7 +305,7 @@ struct atomic_ref<_Tp> : public __atomic_ref_base<_Tp> {
 template <class _Tp>
   requires std::floating_point<_Tp>
 struct atomic_ref<_Tp> : public __atomic_ref_base<_Tp> {
-  using __base = __atomic_ref_base<_Tp>;
+  using __base _LIBCPP_NODEBUG = __atomic_ref_base<_Tp>;
 
   using difference_type = __base::value_type;
 
@@ -344,7 +344,7 @@ struct atomic_ref<_Tp> : public __atomic_ref_base<_Tp> {
 
 template <class _Tp>
 struct atomic_ref<_Tp*> : public __atomic_ref_base<_Tp*> {
-  using __base = __atomic_ref_base<_Tp*>;
+  using __base _LIBCPP_NODEBUG = __atomic_ref_base<_Tp*>;
 
   using difference_type = ptrdiff_t;
 
diff --git a/libcxx/include/__atomic/atomic_sync.h b/libcxx/include/__atomic/atomic_sync.h
index 153001e7b62e..ab9bc59fdcfe 100644
--- a/libcxx/include/__atomic/atomic_sync.h
+++ b/libcxx/include/__atomic/atomic_sync.h
@@ -81,7 +81,7 @@ struct __atomic_wait_backoff_impl {
   _Poll __poll_;
   memory_order __order_;
 
-  using __waitable_traits = __atomic_waitable_traits<__decay_t<_AtomicWaitable> >;
+  using __waitable_traits _LIBCPP_NODEBUG = __atomic_waitable_traits<__decay_t<_AtomicWaitable> >;
 
   _LIBCPP_AVAILABILITY_SYNC
   _LIBCPP_HIDE_FROM_ABI bool
diff --git a/libcxx/include/__atomic/contention_t.h b/libcxx/include/__atomic/contention_t.h
index 6f2a073bc1a8..5b42a0125f87 100644
--- a/libcxx/include/__atomic/contention_t.h
+++ b/libcxx/include/__atomic/contention_t.h
@@ -20,12 +20,12 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 #if defined(__linux__) || (defined(_AIX) && !defined(__64BIT__))
-using __cxx_contention_t = int32_t;
+using __cxx_contention_t _LIBCPP_NODEBUG = int32_t;
 #else
-using __cxx_contention_t = int64_t;
+using __cxx_contention_t _LIBCPP_NODEBUG = int64_t;
 #endif // __linux__ || (_AIX && !__64BIT__)
 
-using __cxx_atomic_contention_t = __cxx_atomic_impl<__cxx_contention_t>;
+using __cxx_atomic_contention_t _LIBCPP_NODEBUG = __cxx_atomic_impl<__cxx_contention_t>;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__atomic/memory_order.h b/libcxx/include/__atomic/memory_order.h
index 294121d1c4e7..44790fe888b3 100644
--- a/libcxx/include/__atomic/memory_order.h
+++ b/libcxx/include/__atomic/memory_order.h
@@ -24,7 +24,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // to pin the underlying type in C++20.
 enum __legacy_memory_order { __mo_relaxed, __mo_consume, __mo_acquire, __mo_release, __mo_acq_rel, __mo_seq_cst };
 
-using __memory_order_underlying_t = underlying_type<__legacy_memory_order>::type;
+using __memory_order_underlying_t _LIBCPP_NODEBUG = underlying_type<__legacy_memory_order>::type;
 
 #if _LIBCPP_STD_VER >= 20
 
diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference
index 9fa24c98d493..7e27090cc68a 100644
--- a/libcxx/include/__bit_reference
+++ b/libcxx/include/__bit_reference
@@ -43,8 +43,8 @@ struct __has_storage_type {
 
 template <class _Cp, bool = __has_storage_type<_Cp>::value>
 class __bit_reference {
-  using __storage_type    = typename _Cp::__storage_type;
-  using __storage_pointer = typename _Cp::__storage_pointer;
+  using __storage_type _LIBCPP_NODEBUG    = typename _Cp::__storage_type;
+  using __storage_pointer _LIBCPP_NODEBUG = typename _Cp::__storage_pointer;
 
   __storage_pointer __seg_;
   __storage_type __mask_;
@@ -55,7 +55,7 @@ class __bit_reference {
   friend class __bit_iterator<_Cp, false>;
 
 public:
-  using __container = typename _Cp::__self;
+  using __container _LIBCPP_NODEBUG = typename _Cp::__self;
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_reference(const __bit_reference&) = default;
 
@@ -135,8 +135,8 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void swap(bool& __x,
 
 template <class _Cp>
 class __bit_const_reference {
-  using __storage_type    = typename _Cp::__storage_type;
-  using __storage_pointer = typename _Cp::__const_storage_pointer;
+  using __storage_type _LIBCPP_NODEBUG    = typename _Cp::__storage_type;
+  using __storage_pointer _LIBCPP_NODEBUG = typename _Cp::__const_storage_pointer;
 
   __storage_pointer __seg_;
   __storage_type __mask_;
@@ -145,7 +145,7 @@ class __bit_const_reference {
   friend class __bit_iterator<_Cp, true>;
 
 public:
-  using __container = typename _Cp::__self;
+  using __container _LIBCPP_NODEBUG = typename _Cp::__self;
 
   _LIBCPP_HIDE_FROM_ABI __bit_const_reference(const __bit_const_reference&) = default;
   __bit_const_reference& operator=(const __bit_const_reference&)            = delete;
@@ -587,10 +587,10 @@ inline _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cr, false> swap_ranges(
 
 template <class _Cp>
 struct __bit_array {
-  using difference_type   = typename _Cp::difference_type;
-  using __storage_type    = typename _Cp::__storage_type;
-  using __storage_pointer = typename _Cp::__storage_pointer;
-  using iterator          = typename _Cp::iterator;
+  using difference_type _LIBCPP_NODEBUG   = typename _Cp::difference_type;
+  using __storage_type _LIBCPP_NODEBUG    = typename _Cp::__storage_type;
+  using __storage_pointer _LIBCPP_NODEBUG = typename _Cp::__storage_pointer;
+  using iterator _LIBCPP_NODEBUG          = typename _Cp::iterator;
 
   static const unsigned __bits_per_word = _Cp::__bits_per_word;
   static const unsigned _Np             = 4;
@@ -790,8 +790,8 @@ public:
   using iterator_category = random_access_iterator_tag;
 
 private:
-  using __storage_type = typename _Cp::__storage_type;
-  using __storage_pointer =
+  using __storage_type _LIBCPP_NODEBUG = typename _Cp::__storage_type;
+  using __storage_pointer _LIBCPP_NODEBUG =
       __conditional_t<_IsConst, typename _Cp::__const_storage_pointer, typename _Cp::__storage_pointer>;
 
   static const unsigned __bits_per_word = _Cp::__bits_per_word;
diff --git a/libcxx/include/__chrono/formatter.h b/libcxx/include/__chrono/formatter.h
index 3671e6aa52b4..1086dde38195 100644
--- a/libcxx/include/__chrono/formatter.h
+++ b/libcxx/include/__chrono/formatter.h
@@ -711,7 +711,7 @@ public:
 template <class _Duration, __fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::sys_time<_Duration>, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -722,7 +722,7 @@ public:
 template <class _Duration, __fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::file_time<_Duration>, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -733,7 +733,7 @@ public:
 template <class _Duration, __fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::local_time<_Duration>, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -745,7 +745,7 @@ public:
 template <class _Rep, class _Period, __fmt_char_type _CharT>
 struct formatter<chrono::duration<_Rep, _Period>, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -767,7 +767,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::day, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -778,7 +778,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::month, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -789,7 +789,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::year, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -800,7 +800,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::weekday, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -811,7 +811,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::weekday_indexed, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -822,7 +822,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::weekday_last, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -833,7 +833,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::month_day, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -844,7 +844,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::month_day_last, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -855,7 +855,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::month_weekday, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -866,7 +866,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::month_weekday_last, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -877,7 +877,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::year_month, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -888,7 +888,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::year_month_day, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -899,7 +899,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::year_month_day_last, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -910,7 +910,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::year_month_weekday, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -921,7 +921,7 @@ public:
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::year_month_weekday_last, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -932,7 +932,7 @@ public:
 template <class _Duration, __fmt_char_type _CharT>
 struct formatter<chrono::hh_mm_ss<_Duration>, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -944,7 +944,7 @@ public:
 template <__fmt_char_type _CharT>
 struct formatter<chrono::sys_info, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -955,7 +955,7 @@ public:
 template <__fmt_char_type _CharT>
 struct formatter<chrono::local_info, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
@@ -968,7 +968,7 @@ public:
 template <class _Duration, class _TimeZonePtr, __fmt_char_type _CharT>
 struct formatter<chrono::zoned_time<_Duration, _TimeZonePtr>, _CharT> : public __formatter_chrono<_CharT> {
 public:
-  using _Base = __formatter_chrono<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
 
   template <class _ParseContext>
   _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
diff --git a/libcxx/include/__chrono/hh_mm_ss.h b/libcxx/include/__chrono/hh_mm_ss.h
index c460b1130508..6ea8a28ee093 100644
--- a/libcxx/include/__chrono/hh_mm_ss.h
+++ b/libcxx/include/__chrono/hh_mm_ss.h
@@ -30,7 +30,7 @@ template <class _Duration>
 class hh_mm_ss {
 private:
   static_assert(__is_duration_v<_Duration>, "template parameter of hh_mm_ss must be a std::chrono::duration");
-  using __CommonType = common_type_t<_Duration, chrono::seconds>;
+  using __CommonType _LIBCPP_NODEBUG = common_type_t<_Duration, chrono::seconds>;
 
   _LIBCPP_HIDE_FROM_ABI static constexpr uint64_t __pow10(unsigned __exp) {
     uint64_t __ret = 1;
diff --git a/libcxx/include/__chrono/parser_std_format_spec.h b/libcxx/include/__chrono/parser_std_format_spec.h
index 3976864c12b9..4df8e603c6bc 100644
--- a/libcxx/include/__chrono/parser_std_format_spec.h
+++ b/libcxx/include/__chrono/parser_std_format_spec.h
@@ -140,7 +140,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr void __validate_time_zone(__flags __flags) {
 
 template <class _CharT>
 class _LIBCPP_TEMPLATE_VIS __parser_chrono {
-  using _ConstIterator = typename basic_format_parse_context<_CharT>::const_iterator;
+  using _ConstIterator _LIBCPP_NODEBUG = typename basic_format_parse_context<_CharT>::const_iterator;
 
 public:
   template <class _ParseContext>
diff --git a/libcxx/include/__chrono/zoned_time.h b/libcxx/include/__chrono/zoned_time.h
index f57e65c90a62..1deba10d96ae 100644
--- a/libcxx/include/__chrono/zoned_time.h
+++ b/libcxx/include/__chrono/zoned_time.h
@@ -66,7 +66,7 @@ class zoned_time {
   // Using these constraints in the code causes the compiler to give an
   // error that the constraint depends on itself. To avoid that issue use
   // the fact it is possible to create this object from a _TimeZonePtr.
-  using __traits = zoned_traits<_TimeZonePtr>;
+  using __traits _LIBCPP_NODEBUG = zoned_traits<_TimeZonePtr>;
 
 public:
   using duration = common_type_t<_Duration, seconds>;
@@ -186,7 +186,7 @@ template <class _Duration>
 zoned_time(sys_time<_Duration>) -> zoned_time<common_type_t<_Duration, seconds>>;
 
 template <class _TimeZonePtrOrName>
-using __time_zone_representation =
+using __time_zone_representation _LIBCPP_NODEBUG =
     conditional_t<is_convertible_v<_TimeZonePtrOrName, string_view>,
                   const time_zone*,
                   remove_cvref_t<_TimeZonePtrOrName>>;
diff --git a/libcxx/include/__compare/ordering.h b/libcxx/include/__compare/ordering.h
index 297218e6f29e..902ef5329dd4 100644
--- a/libcxx/include/__compare/ordering.h
+++ b/libcxx/include/__compare/ordering.h
@@ -120,7 +120,7 @@ inline constexpr partial_ordering partial_ordering::greater(_PartialOrdResult::_
 inline constexpr partial_ordering partial_ordering::unordered(_PartialOrdResult::__unordered);
 
 class weak_ordering {
-  using _ValueT = signed char;
+  using _ValueT _LIBCPP_NODEBUG = signed char;
 
   _LIBCPP_HIDE_FROM_ABI explicit constexpr weak_ordering(_OrdResult __v) noexcept : __value_(_ValueT(__v)) {}
 
@@ -190,7 +190,7 @@ inline constexpr weak_ordering weak_ordering::equivalent(_OrdResult::__equiv);
 inline constexpr weak_ordering weak_ordering::greater(_OrdResult::__greater);
 
 class strong_ordering {
-  using _ValueT = signed char;
+  using _ValueT _LIBCPP_NODEBUG = signed char;
 
   _LIBCPP_HIDE_FROM_ABI explicit constexpr strong_ordering(_OrdResult __v) noexcept : __value_(_ValueT(__v)) {}
 
diff --git a/libcxx/include/__compare/synth_three_way.h b/libcxx/include/__compare/synth_three_way.h
index e48ce4979983..63bf56d0cf42 100644
--- a/libcxx/include/__compare/synth_three_way.h
+++ b/libcxx/include/__compare/synth_three_way.h
@@ -43,7 +43,8 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr auto __synth_three_way = []<class _Tp, cl
 };
 
 template <class _Tp, class _Up = _Tp>
-using __synth_three_way_result = decltype(std::__synth_three_way(std::declval<_Tp&>(), std::declval<_Up&>()));
+using __synth_three_way_result _LIBCPP_NODEBUG =
+    decltype(std::__synth_three_way(std::declval<_Tp&>(), std::declval<_Up&>()));
 
 #endif // _LIBCPP_STD_VER >= 20
 
diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h
index 7df46a0e94dc..6257e6f729bf 100644
--- a/libcxx/include/__exception/exception_ptr.h
+++ b/libcxx/include/__exception/exception_ptr.h
@@ -66,7 +66,7 @@ class _LIBCPP_EXPORTED_FROM_ABI exception_ptr {
 
 public:
   // exception_ptr is basically a COW string.
-  using __trivially_relocatable = exception_ptr;
+  using __trivially_relocatable _LIBCPP_NODEBUG = exception_ptr;
 
   _LIBCPP_HIDE_FROM_ABI exception_ptr() _NOEXCEPT : __ptr_() {}
   _LIBCPP_HIDE_FROM_ABI exception_ptr(nullptr_t) _NOEXCEPT : __ptr_() {}
diff --git a/libcxx/include/__expected/expected.h b/libcxx/include/__expected/expected.h
index 3d3f11967ee7..03bbd1623ed5 100644
--- a/libcxx/include/__expected/expected.h
+++ b/libcxx/include/__expected/expected.h
@@ -459,14 +459,14 @@ class expected : private __expected_base<_Tp, _Err> {
   template <class _Up, class _OtherErr>
   friend class expected;
 
-  using __base = __expected_base<_Tp, _Err>;
+  using __base _LIBCPP_NODEBUG = __expected_base<_Tp, _Err>;
 
 public:
   using value_type      = _Tp;
   using error_type      = _Err;
   using unexpected_type = unexpected<_Err>;
 
-  using __trivially_relocatable =
+  using __trivially_relocatable _LIBCPP_NODEBUG =
       __conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value && __libcpp_is_trivially_relocatable<_Err>::value,
                       expected,
                       void>;
@@ -505,7 +505,7 @@ public:
 
 private:
   template <class _Up, class _OtherErr, class _UfQual, class _OtherErrQual>
-  using __can_convert = _And<
+  using __can_convert _LIBCPP_NODEBUG = _And<
       is_constructible<_Tp, _UfQual>,
       is_constructible<_Err, _OtherErrQual>,
       _If<_Not<is_same<remove_cv_t<_Tp>, bool>>::value,
@@ -1363,7 +1363,7 @@ class expected<_Tp, _Err> : private __expected_void_base<_Err> {
   friend class expected;
 
   template <class _Up, class _OtherErr, class _OtherErrQual>
-  using __can_convert =
+  using __can_convert _LIBCPP_NODEBUG =
       _And< is_void<_Up>,
             is_constructible<_Err, _OtherErrQual>,
             _Not<is_constructible<unexpected<_Err>, expected<_Up, _OtherErr>&>>,
@@ -1371,7 +1371,7 @@ class expected<_Tp, _Err> : private __expected_void_base<_Err> {
             _Not<is_constructible<unexpected<_Err>, const expected<_Up, _OtherErr>&>>,
             _Not<is_constructible<unexpected<_Err>, const expected<_Up, _OtherErr>>>>;
 
-  using __base = __expected_void_base<_Err>;
+  using __base _LIBCPP_NODEBUG = __expected_void_base<_Err>;
 
 public:
   using value_type      = _Tp;
diff --git a/libcxx/include/__expected/unexpected.h b/libcxx/include/__expected/unexpected.h
index cf110bcf69a8..6904889b8c6b 100644
--- a/libcxx/include/__expected/unexpected.h
+++ b/libcxx/include/__expected/unexpected.h
@@ -48,12 +48,12 @@ template <class _Err>
 struct __is_std_unexpected<unexpected<_Err>> : true_type {};
 
 template <class _Tp>
-using __valid_std_unexpected = _BoolConstant< //
-    is_object_v<_Tp> &&                       //
-    !is_array_v<_Tp> &&                       //
-    !__is_std_unexpected<_Tp>::value &&       //
-    !is_const_v<_Tp> &&                       //
-    !is_volatile_v<_Tp>                       //
+using __valid_std_unexpected _LIBCPP_NODEBUG = _BoolConstant< //
+    is_object_v<_Tp> &&                                       //
+    !is_array_v<_Tp> &&                                       //
+    !__is_std_unexpected<_Tp>::value &&                       //
+    !is_const_v<_Tp> &&                                       //
+    !is_volatile_v<_Tp>                                       //
     >;
 
 template <class _Err>
diff --git a/libcxx/include/__filesystem/directory_entry.h b/libcxx/include/__filesystem/directory_entry.h
index 7d0c01b98def..11e07acdbe00 100644
--- a/libcxx/include/__filesystem/directory_entry.h
+++ b/libcxx/include/__filesystem/directory_entry.h
@@ -22,7 +22,9 @@
 #include <__filesystem/perms.h>
 #include <__fwd/ostream.h>
 #include <__system_error/errc.h>
+#include <__system_error/error_category.h>
 #include <__system_error/error_code.h>
+#include <__system_error/error_condition.h>
 #include <__utility/move.h>
 #include <__utility/unreachable.h>
 #include <cstdint>
@@ -274,15 +276,7 @@ private:
   _LIBCPP_EXPORTED_FROM_ABI error_code __do_refresh() noexcept;
 
   _LIBCPP_HIDE_FROM_ABI static bool __is_dne_error(error_code const& __ec) {
-    if (!__ec)
-      return true;
-    switch (static_cast<errc>(__ec.value())) {
-    case errc::no_such_file_or_directory:
-    case errc::not_a_directory:
-      return true;
-    default:
-      return false;
-    }
+    return !__ec || __ec == errc::no_such_file_or_directory || __ec == errc::not_a_directory;
   }
 
   _LIBCPP_HIDE_FROM_ABI void
diff --git a/libcxx/include/__filesystem/path.h b/libcxx/include/__filesystem/path.h
index 509d1cc8052f..0a751ba32954 100644
--- a/libcxx/include/__filesystem/path.h
+++ b/libcxx/include/__filesystem/path.h
@@ -51,30 +51,30 @@ template <class _Tp>
 struct __can_convert_char<const _Tp> : public __can_convert_char<_Tp> {};
 template <>
 struct __can_convert_char<char> {
-  static const bool value = true;
-  using __char_type       = char;
+  static const bool value           = true;
+  using __char_type _LIBCPP_NODEBUG = char;
 };
 template <>
 struct __can_convert_char<wchar_t> {
-  static const bool value = true;
-  using __char_type       = wchar_t;
+  static const bool value           = true;
+  using __char_type _LIBCPP_NODEBUG = wchar_t;
 };
 #  if _LIBCPP_HAS_CHAR8_T
 template <>
 struct __can_convert_char<char8_t> {
-  static const bool value = true;
-  using __char_type       = char8_t;
+  static const bool value           = true;
+  using __char_type _LIBCPP_NODEBUG = char8_t;
 };
 #  endif
 template <>
 struct __can_convert_char<char16_t> {
-  static const bool value = true;
-  using __char_type       = char16_t;
+  static const bool value           = true;
+  using __char_type _LIBCPP_NODEBUG = char16_t;
 };
 template <>
 struct __can_convert_char<char32_t> {
-  static const bool value = true;
-  using __char_type       = char32_t;
+  static const bool value           = true;
+  using __char_type _LIBCPP_NODEBUG = char32_t;
 };
 
 template <class _ECharT, __enable_if_t<__can_convert_char<_ECharT>::value, int> = 0>
@@ -95,7 +95,7 @@ typedef string __u8_string;
 struct _NullSentinel {};
 
 template <class _Tp>
-using _Void = void;
+using _Void _LIBCPP_NODEBUG = void;
 
 template <class _Tp, class = void>
 struct __is_pathable_string : public false_type {};
@@ -104,7 +104,7 @@ template <class _ECharT, class _Traits, class _Alloc>
 struct __is_pathable_string< basic_string<_ECharT, _Traits, _Alloc>,
                              _Void<typename __can_convert_char<_ECharT>::__char_type> >
     : public __can_convert_char<_ECharT> {
-  using _Str = basic_string<_ECharT, _Traits, _Alloc>;
+  using _Str _LIBCPP_NODEBUG = basic_string<_ECharT, _Traits, _Alloc>;
 
   _LIBCPP_HIDE_FROM_ABI static _ECharT const* __range_begin(_Str const& __s) { return __s.data(); }
 
@@ -117,7 +117,7 @@ template <class _ECharT, class _Traits>
 struct __is_pathable_string< basic_string_view<_ECharT, _Traits>,
                              _Void<typename __can_convert_char<_ECharT>::__char_type> >
     : public __can_convert_char<_ECharT> {
-  using _Str = basic_string_view<_ECharT, _Traits>;
+  using _Str _LIBCPP_NODEBUG = basic_string_view<_ECharT, _Traits>;
 
   _LIBCPP_HIDE_FROM_ABI static _ECharT const* __range_begin(_Str const& __s) { return __s.data(); }
 
@@ -157,7 +157,7 @@ struct __is_pathable_iter<
     true,
     _Void<typename __can_convert_char< typename iterator_traits<_Iter>::value_type>::__char_type> >
     : __can_convert_char<typename iterator_traits<_Iter>::value_type> {
-  using _ECharT = typename iterator_traits<_Iter>::value_type;
+  using _ECharT _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::value_type;
 
   _LIBCPP_HIDE_FROM_ABI static _Iter __range_begin(_Iter __b) { return __b; }
 
@@ -380,13 +380,13 @@ struct _PathExport<char8_t> {
 
 class _LIBCPP_EXPORTED_FROM_ABI path {
   template <class _SourceOrIter, class _Tp = path&>
-  using _EnableIfPathable = __enable_if_t<__is_pathable<_SourceOrIter>::value, _Tp>;
+  using _EnableIfPathable _LIBCPP_NODEBUG = __enable_if_t<__is_pathable<_SourceOrIter>::value, _Tp>;
 
   template <class _Tp>
-  using _SourceChar = typename __is_pathable<_Tp>::__char_type;
+  using _SourceChar _LIBCPP_NODEBUG = typename __is_pathable<_Tp>::__char_type;
 
   template <class _Tp>
-  using _SourceCVT = _PathCVT<_SourceChar<_Tp> >;
+  using _SourceCVT _LIBCPP_NODEBUG = _PathCVT<_SourceChar<_Tp> >;
 
 public:
 #  if defined(_LIBCPP_WIN32API)
diff --git a/libcxx/include/__flat_map/flat_map.h b/libcxx/include/__flat_map/flat_map.h
index b66bc1cb66fc..9fe84250b120 100644
--- a/libcxx/include/__flat_map/flat_map.h
+++ b/libcxx/include/__flat_map/flat_map.h
@@ -90,7 +90,7 @@ class flat_map {
   static_assert(!is_same_v<_MappedContainer, std::vector<bool>>, "vector<bool> is not a sequence container");
 
   template <bool _Const>
-  using __iterator = __key_value_iterator<flat_map, _KeyContainer, _MappedContainer, _Const>;
+  using __iterator _LIBCPP_NODEBUG = __key_value_iterator<flat_map, _KeyContainer, _MappedContainer, _Const>;
 
 public:
   // types
diff --git a/libcxx/include/__flat_map/key_value_iterator.h b/libcxx/include/__flat_map/key_value_iterator.h
index 987ac677a413..06a23f342997 100644
--- a/libcxx/include/__flat_map/key_value_iterator.h
+++ b/libcxx/include/__flat_map/key_value_iterator.h
@@ -41,9 +41,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Owner, class _KeyContainer, class _MappedContainer, bool _Const>
 struct __key_value_iterator {
 private:
-  using __key_iterator    = ranges::iterator_t<const _KeyContainer>;
-  using __mapped_iterator = ranges::iterator_t<__maybe_const<_Const, _MappedContainer>>;
-  using __reference       = _If<_Const, typename _Owner::const_reference, typename _Owner::reference>;
+  using __key_iterator _LIBCPP_NODEBUG    = ranges::iterator_t<const _KeyContainer>;
+  using __mapped_iterator _LIBCPP_NODEBUG = ranges::iterator_t<__maybe_const<_Const, _MappedContainer>>;
+  using __reference _LIBCPP_NODEBUG       = _If<_Const, typename _Owner::const_reference, typename _Owner::reference>;
 
   struct __arrow_proxy {
     __reference __ref_;
diff --git a/libcxx/include/__format/buffer.h b/libcxx/include/__format/buffer.h
index 618b8ef02564..9509f19e1672 100644
--- a/libcxx/include/__format/buffer.h
+++ b/libcxx/include/__format/buffer.h
@@ -322,7 +322,7 @@ struct _LIBCPP_TEMPLATE_VIS __back_insert_iterator_container<back_insert_iterato
 template <class _Container>
 class _LIBCPP_TEMPLATE_VIS __writer_container {
 public:
-  using _CharT = typename _Container::value_type;
+  using _CharT _LIBCPP_NODEBUG = typename _Container::value_type;
 
   _LIBCPP_HIDE_FROM_ABI explicit __writer_container(back_insert_iterator<_Container> __out_it)
       : __container_{__out_it.__get_container()} {}
@@ -340,7 +340,7 @@ private:
 /// Selects the type of the writer used for the output iterator.
 template <class _OutIt, class _CharT>
 class _LIBCPP_TEMPLATE_VIS __writer_selector {
-  using _Container = typename __back_insert_iterator_container<_OutIt>::type;
+  using _Container _LIBCPP_NODEBUG = typename __back_insert_iterator_container<_OutIt>::type;
 
 public:
   using type =
@@ -355,7 +355,7 @@ public:
 template <class _OutIt, __fmt_char_type _CharT>
   requires(output_iterator<_OutIt, const _CharT&>)
 class _LIBCPP_TEMPLATE_VIS __format_buffer {
-  using _Storage =
+  using _Storage _LIBCPP_NODEBUG =
       conditional_t<__enable_direct_output<_OutIt, _CharT>, __direct_storage<_CharT>, __internal_storage<_CharT>>;
 
 public:
@@ -408,7 +408,7 @@ private:
 template <class _OutIt, __fmt_char_type _CharT, bool>
   requires(output_iterator<_OutIt, const _CharT&>)
 struct _LIBCPP_TEMPLATE_VIS __format_to_n_buffer_base {
-  using _Size = iter_difference_t<_OutIt>;
+  using _Size _LIBCPP_NODEBUG = iter_difference_t<_OutIt>;
 
 public:
   _LIBCPP_HIDE_FROM_ABI explicit __format_to_n_buffer_base(_OutIt __out_it, _Size __max_size)
@@ -438,7 +438,7 @@ protected:
 template <class _OutIt, __fmt_char_type _CharT>
   requires(output_iterator<_OutIt, const _CharT&>)
 class _LIBCPP_TEMPLATE_VIS __format_to_n_buffer_base<_OutIt, _CharT, true> {
-  using _Size = iter_difference_t<_OutIt>;
+  using _Size _LIBCPP_NODEBUG = iter_difference_t<_OutIt>;
 
 public:
   _LIBCPP_HIDE_FROM_ABI explicit __format_to_n_buffer_base(_OutIt __out_it, _Size __max_size)
@@ -489,8 +489,8 @@ template <class _OutIt, __fmt_char_type _CharT>
   requires(output_iterator<_OutIt, const _CharT&>)
 struct _LIBCPP_TEMPLATE_VIS __format_to_n_buffer final
     : public __format_to_n_buffer_base< _OutIt, _CharT, __enable_direct_output<_OutIt, _CharT>> {
-  using _Base = __format_to_n_buffer_base<_OutIt, _CharT, __enable_direct_output<_OutIt, _CharT>>;
-  using _Size = iter_difference_t<_OutIt>;
+  using _Base _LIBCPP_NODEBUG = __format_to_n_buffer_base<_OutIt, _CharT, __enable_direct_output<_OutIt, _CharT>>;
+  using _Size _LIBCPP_NODEBUG = iter_difference_t<_OutIt>;
 
 public:
   _LIBCPP_HIDE_FROM_ABI explicit __format_to_n_buffer(_OutIt __out_it, _Size __max_size)
@@ -523,7 +523,7 @@ public:
 // would lead to a circular include with formatter for vector<bool>.
 template <__fmt_char_type _CharT>
 class _LIBCPP_TEMPLATE_VIS __retarget_buffer {
-  using _Alloc = allocator<_CharT>;
+  using _Alloc _LIBCPP_NODEBUG = allocator<_CharT>;
 
 public:
   using value_type = _CharT;
diff --git a/libcxx/include/__format/concepts.h b/libcxx/include/__format/concepts.h
index 2c40e3e31491..28297c612db7 100644
--- a/libcxx/include/__format/concepts.h
+++ b/libcxx/include/__format/concepts.h
@@ -44,7 +44,7 @@ concept __fmt_char_type =
 // (Note testing for (w)format_context would be a valid choice, but requires
 // selecting the proper one depending on the type of _CharT.)
 template <class _CharT>
-using __fmt_iter_for = _CharT*;
+using __fmt_iter_for _LIBCPP_NODEBUG = _CharT*;
 
 template <class _Tp, class _Context, class _Formatter = typename _Context::template formatter_type<remove_const_t<_Tp>>>
 concept __formattable_with =
diff --git a/libcxx/include/__format/container_adaptor.h b/libcxx/include/__format/container_adaptor.h
index d3be2e189560..48d42ee7d901 100644
--- a/libcxx/include/__format/container_adaptor.h
+++ b/libcxx/include/__format/container_adaptor.h
@@ -37,8 +37,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Adaptor, class _CharT>
 struct _LIBCPP_TEMPLATE_VIS __formatter_container_adaptor {
 private:
-  using __maybe_const_container = __fmt_maybe_const<typename _Adaptor::container_type, _CharT>;
-  using __maybe_const_adaptor   = __maybe_const<is_const_v<__maybe_const_container>, _Adaptor>;
+  using __maybe_const_container _LIBCPP_NODEBUG = __fmt_maybe_const<typename _Adaptor::container_type, _CharT>;
+  using __maybe_const_adaptor _LIBCPP_NODEBUG   = __maybe_const<is_const_v<__maybe_const_container>, _Adaptor>;
   formatter<ranges::ref_view<__maybe_const_container>, _CharT> __underlying_;
 
 public:
diff --git a/libcxx/include/__format/format_arg.h b/libcxx/include/__format/format_arg.h
index a973ccd43c42..1c530fd5a5d0 100644
--- a/libcxx/include/__format/format_arg.h
+++ b/libcxx/include/__format/format_arg.h
@@ -208,7 +208,7 @@ _LIBCPP_HIDE_FROM_ABI _Rp __visit_format_arg(_Visitor&& __vis, basic_format_arg<
 /// separate arrays.
 template <class _Context>
 class __basic_format_arg_value {
-  using _CharT = typename _Context::char_type;
+  using _CharT _LIBCPP_NODEBUG = typename _Context::char_type;
 
 public:
   /// Contains the implementation for basic_format_arg::handle.
diff --git a/libcxx/include/__format/format_arg_store.h b/libcxx/include/__format/format_arg_store.h
index 8b2c95c657c9..4c5ee9e9e4fd 100644
--- a/libcxx/include/__format/format_arg_store.h
+++ b/libcxx/include/__format/format_arg_store.h
@@ -257,7 +257,7 @@ struct _LIBCPP_TEMPLATE_VIS __format_arg_store {
     }
   }
 
-  using _Storage =
+  using _Storage _LIBCPP_NODEBUG =
       conditional_t<__format::__use_packed_format_arg_store(sizeof...(_Args)),
                     __format::__packed_format_arg_store<_Context, sizeof...(_Args)>,
                     __format::__unpacked_format_arg_store<_Context, sizeof...(_Args)>>;
diff --git a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h
index 3991363c0124..b920be5acbe8 100644
--- a/libcxx/include/__format/format_functions.h
+++ b/libcxx/include/__format/format_functions.h
@@ -379,7 +379,7 @@ struct _LIBCPP_TEMPLATE_VIS basic_format_string {
 private:
   basic_string_view<_CharT> __str_;
 
-  using _Context = __format::__compile_time_basic_format_context<_CharT>;
+  using _Context _LIBCPP_NODEBUG = __format::__compile_time_basic_format_context<_CharT>;
 
   static constexpr array<__format::__arg_t, sizeof...(_Args)> __types_{
       __format::__determine_arg_t<_Context, remove_cvref_t<_Args>>()...};
diff --git a/libcxx/include/__format/formatter_floating_point.h b/libcxx/include/__format/formatter_floating_point.h
index e04fffb683c3..ac4be9b61935 100644
--- a/libcxx/include/__format/formatter_floating_point.h
+++ b/libcxx/include/__format/formatter_floating_point.h
@@ -141,7 +141,7 @@ struct __traits<double> {
 /// on the stack or the heap.
 template <floating_point _Fp>
 class _LIBCPP_TEMPLATE_VIS __float_buffer {
-  using _Traits = __traits<_Fp>;
+  using _Traits _LIBCPP_NODEBUG = __traits<_Fp>;
 
 public:
   // TODO FMT Improve this constructor to do a better estimate.
diff --git a/libcxx/include/__format/formatter_string.h b/libcxx/include/__format/formatter_string.h
index 826d6421c863..30084e582214 100644
--- a/libcxx/include/__format/formatter_string.h
+++ b/libcxx/include/__format/formatter_string.h
@@ -59,7 +59,7 @@ public:
 // Formatter const char*.
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<const _CharT*, _CharT> : public __formatter_string<_CharT> {
-  using _Base = __formatter_string<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_string<_CharT>;
 
   template <class _FormatContext>
   _LIBCPP_HIDE_FROM_ABI typename _FormatContext::iterator format(const _CharT* __str, _FormatContext& __ctx) const {
@@ -78,7 +78,7 @@ struct _LIBCPP_TEMPLATE_VIS formatter<const _CharT*, _CharT> : public __formatte
 // Formatter char*.
 template <__fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<_CharT*, _CharT> : public formatter<const _CharT*, _CharT> {
-  using _Base = formatter<const _CharT*, _CharT>;
+  using _Base _LIBCPP_NODEBUG = formatter<const _CharT*, _CharT>;
 
   template <class _FormatContext>
   _LIBCPP_HIDE_FROM_ABI typename _FormatContext::iterator format(_CharT* __str, _FormatContext& __ctx) const {
@@ -89,7 +89,7 @@ struct _LIBCPP_TEMPLATE_VIS formatter<_CharT*, _CharT> : public formatter<const
 // Formatter char[].
 template <__fmt_char_type _CharT, size_t _Size>
 struct _LIBCPP_TEMPLATE_VIS formatter<_CharT[_Size], _CharT> : public __formatter_string<_CharT> {
-  using _Base = __formatter_string<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_string<_CharT>;
 
   template <class _FormatContext>
   _LIBCPP_HIDE_FROM_ABI typename _FormatContext::iterator
@@ -102,7 +102,7 @@ struct _LIBCPP_TEMPLATE_VIS formatter<_CharT[_Size], _CharT> : public __formatte
 template <__fmt_char_type _CharT, class _Traits, class _Allocator>
 struct _LIBCPP_TEMPLATE_VIS formatter<basic_string<_CharT, _Traits, _Allocator>, _CharT>
     : public __formatter_string<_CharT> {
-  using _Base = __formatter_string<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_string<_CharT>;
 
   template <class _FormatContext>
   _LIBCPP_HIDE_FROM_ABI typename _FormatContext::iterator
@@ -115,7 +115,7 @@ struct _LIBCPP_TEMPLATE_VIS formatter<basic_string<_CharT, _Traits, _Allocator>,
 // Formatter std::string_view.
 template <__fmt_char_type _CharT, class _Traits>
 struct _LIBCPP_TEMPLATE_VIS formatter<basic_string_view<_CharT, _Traits>, _CharT> : public __formatter_string<_CharT> {
-  using _Base = __formatter_string<_CharT>;
+  using _Base _LIBCPP_NODEBUG = __formatter_string<_CharT>;
 
   template <class _FormatContext>
   _LIBCPP_HIDE_FROM_ABI typename _FormatContext::iterator
diff --git a/libcxx/include/__format/range_default_formatter.h b/libcxx/include/__format/range_default_formatter.h
index fb21b0f8beb3..bb4c520f5ea1 100644
--- a/libcxx/include/__format/range_default_formatter.h
+++ b/libcxx/include/__format/range_default_formatter.h
@@ -40,7 +40,7 @@ concept __const_formattable_range =
     ranges::input_range<const _Rp> && formattable<ranges::range_reference_t<const _Rp>, _CharT>;
 
 template <class _Rp, class _CharT>
-using __fmt_maybe_const = conditional_t<__const_formattable_range<_Rp, _CharT>, const _Rp, _Rp>;
+using __fmt_maybe_const _LIBCPP_NODEBUG = conditional_t<__const_formattable_range<_Rp, _CharT>, const _Rp, _Rp>;
 
 _LIBCPP_DIAGNOSTIC_PUSH
 _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wshadow")
@@ -95,7 +95,7 @@ struct _LIBCPP_TEMPLATE_VIS __range_default_formatter;
 template <ranges::input_range _Rp, class _CharT>
 struct _LIBCPP_TEMPLATE_VIS __range_default_formatter<range_format::sequence, _Rp, _CharT> {
 private:
-  using __maybe_const_r = __fmt_maybe_const<_Rp, _CharT>;
+  using __maybe_const_r _LIBCPP_NODEBUG = __fmt_maybe_const<_Rp, _CharT>;
   range_formatter<remove_cvref_t<ranges::range_reference_t<__maybe_const_r>>, _CharT> __underlying_;
 
 public:
@@ -122,8 +122,8 @@ public:
 template <ranges::input_range _Rp, class _CharT>
 struct _LIBCPP_TEMPLATE_VIS __range_default_formatter<range_format::map, _Rp, _CharT> {
 private:
-  using __maybe_const_map = __fmt_maybe_const<_Rp, _CharT>;
-  using __element_type    = remove_cvref_t<ranges::range_reference_t<__maybe_const_map>>;
+  using __maybe_const_map _LIBCPP_NODEBUG = __fmt_maybe_const<_Rp, _CharT>;
+  using __element_type _LIBCPP_NODEBUG    = remove_cvref_t<ranges::range_reference_t<__maybe_const_map>>;
   range_formatter<__element_type, _CharT> __underlying_;
 
 public:
@@ -150,8 +150,8 @@ public:
 template <ranges::input_range _Rp, class _CharT>
 struct _LIBCPP_TEMPLATE_VIS __range_default_formatter<range_format::set, _Rp, _CharT> {
 private:
-  using __maybe_const_set = __fmt_maybe_const<_Rp, _CharT>;
-  using __element_type    = remove_cvref_t<ranges::range_reference_t<__maybe_const_set>>;
+  using __maybe_const_set _LIBCPP_NODEBUG = __fmt_maybe_const<_Rp, _CharT>;
+  using __element_type _LIBCPP_NODEBUG    = remove_cvref_t<ranges::range_reference_t<__maybe_const_set>>;
   range_formatter<__element_type, _CharT> __underlying_;
 
 public:
diff --git a/libcxx/include/__format/unicode.h b/libcxx/include/__format/unicode.h
index b4f22c739d6a..46096fda1e8a 100644
--- a/libcxx/include/__format/unicode.h
+++ b/libcxx/include/__format/unicode.h
@@ -123,7 +123,7 @@ class __code_point_view;
 /// UTF-8 specialization.
 template <>
 class __code_point_view<char> {
-  using _Iterator = basic_string_view<char>::const_iterator;
+  using _Iterator _LIBCPP_NODEBUG = basic_string_view<char>::const_iterator;
 
 public:
   _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last)
@@ -249,7 +249,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_low(wchar_t __value) {
 /// - 4 UTF-32 (for example Linux)
 template <>
 class __code_point_view<wchar_t> {
-  using _Iterator = typename basic_string_view<wchar_t>::const_iterator;
+  using _Iterator _LIBCPP_NODEBUG = typename basic_string_view<wchar_t>::const_iterator;
 
 public:
   static_assert(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4, "sizeof(wchar_t) has a not implemented value");
@@ -300,8 +300,8 @@ private:
 // This implements the extended rules see
 // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
 class __extended_grapheme_cluster_break {
-  using __EGC_property  = __extended_grapheme_custer_property_boundary::__property;
-  using __inCB_property = __indic_conjunct_break::__property;
+  using __EGC_property _LIBCPP_NODEBUG  = __extended_grapheme_custer_property_boundary::__property;
+  using __inCB_property _LIBCPP_NODEBUG = __indic_conjunct_break::__property;
 
 public:
   _LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_break(char32_t __first_code_point)
@@ -527,7 +527,7 @@ private:
 /// Therefore only this code point is extracted.
 template <class _CharT>
 class __extended_grapheme_cluster_view {
-  using _Iterator = typename basic_string_view<_CharT>::const_iterator;
+  using _Iterator _LIBCPP_NODEBUG = typename basic_string_view<_CharT>::const_iterator;
 
 public:
   _LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(_Iterator __first, _Iterator __last)
@@ -572,7 +572,7 @@ __extended_grapheme_cluster_view(_Iterator, _Iterator) -> __extended_grapheme_cl
 // This makes it easier to write code agnostic of the _LIBCPP_HAS_UNICODE define.
 template <class _CharT>
 class __code_point_view {
-  using _Iterator = typename basic_string_view<_CharT>::const_iterator;
+  using _Iterator _LIBCPP_NODEBUG = typename basic_string_view<_CharT>::const_iterator;
 
 public:
   _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last)
diff --git a/libcxx/include/__functional/binary_function.h b/libcxx/include/__functional/binary_function.h
index ddee3b170311..bde8b03ef828 100644
--- a/libcxx/include/__functional/binary_function.h
+++ b/libcxx/include/__functional/binary_function.h
@@ -42,11 +42,11 @@ struct __binary_function_keep_layout_base {
 _LIBCPP_DIAGNOSTIC_PUSH
 _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wdeprecated-declarations")
 template <class _Arg1, class _Arg2, class _Result>
-using __binary_function = binary_function<_Arg1, _Arg2, _Result>;
+using __binary_function _LIBCPP_NODEBUG = binary_function<_Arg1, _Arg2, _Result>;
 _LIBCPP_DIAGNOSTIC_POP
 #else
 template <class _Arg1, class _Arg2, class _Result>
-using __binary_function = __binary_function_keep_layout_base<_Arg1, _Arg2, _Result>;
+using __binary_function _LIBCPP_NODEBUG = __binary_function_keep_layout_base<_Arg1, _Arg2, _Result>;
 #endif
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__functional/bind.h b/libcxx/include/__functional/bind.h
index f82c1517249b..e31ad2979035 100644
--- a/libcxx/include/__functional/bind.h
+++ b/libcxx/include/__functional/bind.h
@@ -198,7 +198,7 @@ __apply_functor(_Fp& __f, _BoundArgs& __bound_args, __tuple_indices<_Indx...>, _
 template <class _Fp, class... _BoundArgs>
 class __bind : public __weak_result_type<__decay_t<_Fp> > {
 protected:
-  using _Fd = __decay_t<_Fp>;
+  using _Fd _LIBCPP_NODEBUG = __decay_t<_Fp>;
   typedef tuple<__decay_t<_BoundArgs>...> _Td;
 
 private:
diff --git a/libcxx/include/__functional/boyer_moore_searcher.h b/libcxx/include/__functional/boyer_moore_searcher.h
index 52a58d57a8d4..1e49cc5464be 100644
--- a/libcxx/include/__functional/boyer_moore_searcher.h
+++ b/libcxx/include/__functional/boyer_moore_searcher.h
@@ -92,7 +92,7 @@ class _LIBCPP_TEMPLATE_VIS boyer_moore_searcher {
 private:
   using difference_type = typename std::iterator_traits<_RandomAccessIterator1>::difference_type;
   using value_type      = typename std::iterator_traits<_RandomAccessIterator1>::value_type;
-  using __skip_table_type =
+  using __skip_table_type _LIBCPP_NODEBUG =
       _BMSkipTable<value_type,
                    difference_type,
                    _Hash,
@@ -223,7 +223,7 @@ class _LIBCPP_TEMPLATE_VIS boyer_moore_horspool_searcher {
 private:
   using difference_type = typename iterator_traits<_RandomAccessIterator1>::difference_type;
   using value_type      = typename iterator_traits<_RandomAccessIterator1>::value_type;
-  using __skip_table_type =
+  using __skip_table_type _LIBCPP_NODEBUG =
       _BMSkipTable<value_type,
                    difference_type,
                    _Hash,
diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h
index a421a3ef4f5f..b483e8ea8f85 100644
--- a/libcxx/include/__functional/function.h
+++ b/libcxx/include/__functional/function.h
@@ -576,7 +576,7 @@ private:
 // Used to choose between perfect forwarding or pass-by-value. Pass-by-value is
 // faster for types that can be passed in registers.
 template <typename _Tp>
-using __fast_forward = __conditional_t<is_scalar<_Tp>::value, _Tp, _Tp&&>;
+using __fast_forward _LIBCPP_NODEBUG = __conditional_t<is_scalar<_Tp>::value, _Tp, _Tp&&>;
 
 // __policy_invoker calls an instance of __alloc_func held in __policy_storage.
 
@@ -847,7 +847,7 @@ class _LIBCPP_TEMPLATE_VIS function<_Rp(_ArgTypes...)>
   };
 
   template <class _Fp>
-  using _EnableIfLValueCallable = __enable_if_t<__callable<_Fp&>::value>;
+  using _EnableIfLValueCallable _LIBCPP_NODEBUG = __enable_if_t<__callable<_Fp&>::value>;
 
 public:
   typedef _Rp result_type;
diff --git a/libcxx/include/__functional/perfect_forward.h b/libcxx/include/__functional/perfect_forward.h
index 8fd68db3d6eb..37c3d15b4bec 100644
--- a/libcxx/include/__functional/perfect_forward.h
+++ b/libcxx/include/__functional/perfect_forward.h
@@ -94,7 +94,7 @@ public:
 
 // __perfect_forward implements a perfect-forwarding call wrapper as explained in [func.require].
 template <class _Op, class... _Args>
-using __perfect_forward = __perfect_forward_impl<_Op, index_sequence_for<_Args...>, _Args...>;
+using __perfect_forward _LIBCPP_NODEBUG = __perfect_forward_impl<_Op, index_sequence_for<_Args...>, _Args...>;
 
 #endif // _LIBCPP_STD_VER >= 17
 
diff --git a/libcxx/include/__functional/unary_function.h b/libcxx/include/__functional/unary_function.h
index 69b1bc94220a..769ffc9893a7 100644
--- a/libcxx/include/__functional/unary_function.h
+++ b/libcxx/include/__functional/unary_function.h
@@ -39,11 +39,11 @@ struct __unary_function_keep_layout_base {
 _LIBCPP_DIAGNOSTIC_PUSH
 _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wdeprecated-declarations")
 template <class _Arg, class _Result>
-using __unary_function = unary_function<_Arg, _Result>;
+using __unary_function _LIBCPP_NODEBUG = unary_function<_Arg, _Result>;
 _LIBCPP_DIAGNOSTIC_POP
 #else
 template <class _Arg, class _Result>
-using __unary_function = __unary_function_keep_layout_base<_Arg, _Result>;
+using __unary_function _LIBCPP_NODEBUG = __unary_function_keep_layout_base<_Arg, _Result>;
 #endif
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table
index 9c821ea69081..7788f687746f 100644
--- a/libcxx/include/__hash_table
+++ b/libcxx/include/__hash_table
@@ -111,8 +111,8 @@ struct __hash_node_base {
 template <class _Tp, class _VoidPtr>
 struct __hash_node : public __hash_node_base< __rebind_pointer_t<_VoidPtr, __hash_node<_Tp, _VoidPtr> > > {
   typedef _Tp __node_value_type;
-  using _Base          = __hash_node_base<__rebind_pointer_t<_VoidPtr, __hash_node<_Tp, _VoidPtr> > >;
-  using __next_pointer = typename _Base::__next_pointer;
+  using _Base _LIBCPP_NODEBUG          = __hash_node_base<__rebind_pointer_t<_VoidPtr, __hash_node<_Tp, _VoidPtr> > >;
+  using __next_pointer _LIBCPP_NODEBUG = typename _Base::__next_pointer;
 
   size_t __hash_;
 
diff --git a/libcxx/include/__iterator/aliasing_iterator.h b/libcxx/include/__iterator/aliasing_iterator.h
index aeb5b4a88ec3..e01127142ae9 100644
--- a/libcxx/include/__iterator/aliasing_iterator.h
+++ b/libcxx/include/__iterator/aliasing_iterator.h
@@ -31,8 +31,8 @@ struct __aliasing_iterator_wrapper {
   class __iterator {
     _BaseIter __base_ = nullptr;
 
-    using __iter_traits     = iterator_traits<_BaseIter>;
-    using __base_value_type = typename __iter_traits::value_type;
+    using __iter_traits _LIBCPP_NODEBUG     = iterator_traits<_BaseIter>;
+    using __base_value_type _LIBCPP_NODEBUG = typename __iter_traits::value_type;
 
     static_assert(__has_random_access_iterator_category<_BaseIter>::value,
                   "The base iterator has to be a random access iterator!");
@@ -120,7 +120,7 @@ struct __aliasing_iterator_wrapper {
 
 // This is required to avoid ADL instantiations on _BaseT
 template <class _BaseT, class _Alias>
-using __aliasing_iterator = typename __aliasing_iterator_wrapper<_BaseT, _Alias>::__iterator;
+using __aliasing_iterator _LIBCPP_NODEBUG = typename __aliasing_iterator_wrapper<_BaseT, _Alias>::__iterator;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__iterator/concepts.h b/libcxx/include/__iterator/concepts.h
index 1c227933a482..6e5ac1d3af37 100644
--- a/libcxx/include/__iterator/concepts.h
+++ b/libcxx/include/__iterator/concepts.h
@@ -67,10 +67,10 @@ template <class _In>
 concept indirectly_readable = __indirectly_readable_impl<remove_cvref_t<_In>>;
 
 template <class _Tp>
-using __projected_iterator_t = typename _Tp::__projected_iterator;
+using __projected_iterator_t _LIBCPP_NODEBUG = typename _Tp::__projected_iterator;
 
 template <class _Tp>
-using __projected_projection_t = typename _Tp::__projected_projection;
+using __projected_projection_t _LIBCPP_NODEBUG = typename _Tp::__projected_projection;
 
 template <class _Tp>
 concept __specialization_of_projected = requires {
@@ -89,7 +89,7 @@ struct __indirect_value_t_impl<_Tp> {
 };
 
 template <indirectly_readable _Tp>
-using __indirect_value_t = typename __indirect_value_t_impl<_Tp>::type;
+using __indirect_value_t _LIBCPP_NODEBUG = typename __indirect_value_t_impl<_Tp>::type;
 
 template <indirectly_readable _Tp>
 using iter_common_reference_t = common_reference_t<iter_reference_t<_Tp>, __indirect_value_t<_Tp>>;
@@ -274,7 +274,7 @@ concept indirectly_copyable_storable =
 #endif // _LIBCPP_STD_VER >= 20
 
 template <class _Tp>
-using __has_random_access_iterator_category_or_concept
+using __has_random_access_iterator_category_or_concept _LIBCPP_NODEBUG
 #if _LIBCPP_STD_VER >= 20
     = integral_constant<bool, random_access_iterator<_Tp>>;
 #else  // _LIBCPP_STD_VER < 20
diff --git a/libcxx/include/__iterator/insert_iterator.h b/libcxx/include/__iterator/insert_iterator.h
index b3311042014f..e0ee0ce035e2 100644
--- a/libcxx/include/__iterator/insert_iterator.h
+++ b/libcxx/include/__iterator/insert_iterator.h
@@ -29,10 +29,10 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 #if _LIBCPP_STD_VER >= 20
 template <class _Container>
-using __insert_iterator_iter_t = ranges::iterator_t<_Container>;
+using __insert_iterator_iter_t _LIBCPP_NODEBUG = ranges::iterator_t<_Container>;
 #else
 template <class _Container>
-using __insert_iterator_iter_t = typename _Container::iterator;
+using __insert_iterator_iter_t _LIBCPP_NODEBUG = typename _Container::iterator;
 #endif
 
 _LIBCPP_SUPPRESS_DEPRECATED_PUSH
diff --git a/libcxx/include/__iterator/iterator_traits.h b/libcxx/include/__iterator/iterator_traits.h
index eb6ba8b62fb3..db68dd2c377a 100644
--- a/libcxx/include/__iterator/iterator_traits.h
+++ b/libcxx/include/__iterator/iterator_traits.h
@@ -47,7 +47,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if _LIBCPP_STD_VER >= 20
 
 template <class _Tp>
-using __with_reference = _Tp&;
+using __with_reference _LIBCPP_NODEBUG = _Tp&;
 
 template <class _Tp>
 concept __can_reference = requires { typename __with_reference<_Tp>; };
@@ -80,19 +80,20 @@ struct __iter_traits_cache {
   using type = _If< __is_primary_template<iterator_traits<_Iter> >::value, _Iter, iterator_traits<_Iter> >;
 };
 template <class _Iter>
-using _ITER_TRAITS = typename __iter_traits_cache<_Iter>::type;
+using _ITER_TRAITS _LIBCPP_NODEBUG = typename __iter_traits_cache<_Iter>::type;
 
 struct __iter_concept_concept_test {
   template <class _Iter>
-  using _Apply = typename _ITER_TRAITS<_Iter>::iterator_concept;
+  using _Apply _LIBCPP_NODEBUG = typename _ITER_TRAITS<_Iter>::iterator_concept;
 };
 struct __iter_concept_category_test {
   template <class _Iter>
-  using _Apply = typename _ITER_TRAITS<_Iter>::iterator_category;
+  using _Apply _LIBCPP_NODEBUG = typename _ITER_TRAITS<_Iter>::iterator_category;
 };
 struct __iter_concept_random_fallback {
   template <class _Iter>
-  using _Apply = __enable_if_t< __is_primary_template<iterator_traits<_Iter> >::value, random_access_iterator_tag >;
+  using _Apply _LIBCPP_NODEBUG =
+      __enable_if_t<__is_primary_template<iterator_traits<_Iter> >::value, random_access_iterator_tag>;
 };
 
 template <class _Iter, class _Tester>
@@ -106,7 +107,7 @@ struct __iter_concept_cache {
 };
 
 template <class _Iter>
-using _ITER_CONCEPT = typename __iter_concept_cache<_Iter>::type::template _Apply<_Iter>;
+using _ITER_CONCEPT _LIBCPP_NODEBUG = typename __iter_concept_cache<_Iter>::type::template _Apply<_Iter>;
 
 template <class _Tp>
 struct __has_iterator_typedefs {
@@ -364,7 +365,7 @@ struct __iterator_traits<_Ip> {
 
 template <class _Ip>
 struct iterator_traits : __iterator_traits<_Ip> {
-  using __primary_template = iterator_traits;
+  using __primary_template _LIBCPP_NODEBUG = iterator_traits;
 };
 
 #else  // _LIBCPP_STD_VER >= 20
@@ -397,7 +398,7 @@ struct __iterator_traits<_Iter, true>
 
 template <class _Iter>
 struct _LIBCPP_TEMPLATE_VIS iterator_traits : __iterator_traits<_Iter, __has_iterator_typedefs<_Iter>::value> {
-  using __primary_template = iterator_traits;
+  using __primary_template _LIBCPP_NODEBUG = iterator_traits;
 };
 #endif // _LIBCPP_STD_VER >= 20
 
@@ -430,16 +431,19 @@ template <class _Tp, class _Up>
 struct __has_iterator_concept_convertible_to<_Tp, _Up, false> : false_type {};
 
 template <class _Tp>
-using __has_input_iterator_category = __has_iterator_category_convertible_to<_Tp, input_iterator_tag>;
+using __has_input_iterator_category _LIBCPP_NODEBUG = __has_iterator_category_convertible_to<_Tp, input_iterator_tag>;
 
 template <class _Tp>
-using __has_forward_iterator_category = __has_iterator_category_convertible_to<_Tp, forward_iterator_tag>;
+using __has_forward_iterator_category _LIBCPP_NODEBUG =
+    __has_iterator_category_convertible_to<_Tp, forward_iterator_tag>;
 
 template <class _Tp>
-using __has_bidirectional_iterator_category = __has_iterator_category_convertible_to<_Tp, bidirectional_iterator_tag>;
+using __has_bidirectional_iterator_category _LIBCPP_NODEBUG =
+    __has_iterator_category_convertible_to<_Tp, bidirectional_iterator_tag>;
 
 template <class _Tp>
-using __has_random_access_iterator_category = __has_iterator_category_convertible_to<_Tp, random_access_iterator_tag>;
+using __has_random_access_iterator_category _LIBCPP_NODEBUG =
+    __has_iterator_category_convertible_to<_Tp, random_access_iterator_tag>;
 
 // __libcpp_is_contiguous_iterator determines if an iterator is known by
 // libc++ to be contiguous, either because it advertises itself as such
@@ -466,48 +470,49 @@ template <class _Iter>
 class __wrap_iter;
 
 template <class _Tp>
-using __has_exactly_input_iterator_category =
+using __has_exactly_input_iterator_category _LIBCPP_NODEBUG =
     integral_constant<bool,
                       __has_iterator_category_convertible_to<_Tp, input_iterator_tag>::value &&
                           !__has_iterator_category_convertible_to<_Tp, forward_iterator_tag>::value>;
 
 template <class _Tp>
-using __has_exactly_forward_iterator_category =
+using __has_exactly_forward_iterator_category _LIBCPP_NODEBUG =
     integral_constant<bool,
                       __has_iterator_category_convertible_to<_Tp, forward_iterator_tag>::value &&
                           !__has_iterator_category_convertible_to<_Tp, bidirectional_iterator_tag>::value>;
 
 template <class _Tp>
-using __has_exactly_bidirectional_iterator_category =
+using __has_exactly_bidirectional_iterator_category _LIBCPP_NODEBUG =
     integral_constant<bool,
                       __has_iterator_category_convertible_to<_Tp, bidirectional_iterator_tag>::value &&
                           !__has_iterator_category_convertible_to<_Tp, random_access_iterator_tag>::value>;
 
 template <class _InputIterator>
-using __iter_value_type = typename iterator_traits<_InputIterator>::value_type;
+using __iter_value_type _LIBCPP_NODEBUG = typename iterator_traits<_InputIterator>::value_type;
 
 template <class _InputIterator>
-using __iter_key_type = __remove_const_t<typename iterator_traits<_InputIterator>::value_type::first_type>;
+using __iter_key_type _LIBCPP_NODEBUG =
+    __remove_const_t<typename iterator_traits<_InputIterator>::value_type::first_type>;
 
 template <class _InputIterator>
-using __iter_mapped_type = typename iterator_traits<_InputIterator>::value_type::second_type;
+using __iter_mapped_type _LIBCPP_NODEBUG = typename iterator_traits<_InputIterator>::value_type::second_type;
 
 template <class _InputIterator>
-using __iter_to_alloc_type =
+using __iter_to_alloc_type _LIBCPP_NODEBUG =
     pair<const typename iterator_traits<_InputIterator>::value_type::first_type,
          typename iterator_traits<_InputIterator>::value_type::second_type>;
 
 template <class _Iter>
-using __iterator_category_type = typename iterator_traits<_Iter>::iterator_category;
+using __iterator_category_type _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::iterator_category;
 
 template <class _Iter>
-using __iterator_pointer_type = typename iterator_traits<_Iter>::pointer;
+using __iterator_pointer_type _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::pointer;
 
 template <class _Iter>
-using __iter_diff_t = typename iterator_traits<_Iter>::difference_type;
+using __iter_diff_t _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::difference_type;
 
 template <class _Iter>
-using __iter_reference = typename iterator_traits<_Iter>::reference;
+using __iter_reference _LIBCPP_NODEBUG = typename iterator_traits<_Iter>::reference;
 
 #if _LIBCPP_STD_VER >= 20
 
diff --git a/libcxx/include/__iterator/projected.h b/libcxx/include/__iterator/projected.h
index 1c560ec05500..d12f0167de1d 100644
--- a/libcxx/include/__iterator/projected.h
+++ b/libcxx/include/__iterator/projected.h
@@ -26,9 +26,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _It, class _Proj>
 struct __projected_impl {
   struct __type {
-    using __primary_template     = __type;
-    using __projected_iterator   = _It;
-    using __projected_projection = _Proj;
+    using __primary_template _LIBCPP_NODEBUG     = __type;
+    using __projected_iterator _LIBCPP_NODEBUG   = _It;
+    using __projected_projection _LIBCPP_NODEBUG = _Proj;
 
     using value_type = remove_cvref_t<indirect_result_t<_Proj&, _It>>;
     indirect_result_t<_Proj&, _It> operator*() const; // not defined
@@ -38,9 +38,9 @@ struct __projected_impl {
 template <weakly_incrementable _It, class _Proj>
 struct __projected_impl<_It, _Proj> {
   struct __type {
-    using __primary_template     = __type;
-    using __projected_iterator   = _It;
-    using __projected_projection = _Proj;
+    using __primary_template _LIBCPP_NODEBUG     = __type;
+    using __projected_iterator _LIBCPP_NODEBUG   = _It;
+    using __projected_projection _LIBCPP_NODEBUG = _Proj;
 
     using value_type      = remove_cvref_t<indirect_result_t<_Proj&, _It>>;
     using difference_type = iter_difference_t<_It>;
diff --git a/libcxx/include/__iterator/ranges_iterator_traits.h b/libcxx/include/__iterator/ranges_iterator_traits.h
index 859e7082048a..9a31b651eb5d 100644
--- a/libcxx/include/__iterator/ranges_iterator_traits.h
+++ b/libcxx/include/__iterator/ranges_iterator_traits.h
@@ -24,13 +24,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if _LIBCPP_STD_VER >= 23
 
 template <ranges::input_range _Range>
-using __range_key_type = __remove_const_t<typename ranges::range_value_t<_Range>::first_type>;
+using __range_key_type _LIBCPP_NODEBUG = __remove_const_t<typename ranges::range_value_t<_Range>::first_type>;
 
 template <ranges::input_range _Range>
-using __range_mapped_type = typename ranges::range_value_t<_Range>::second_type;
+using __range_mapped_type _LIBCPP_NODEBUG = typename ranges::range_value_t<_Range>::second_type;
 
 template <ranges::input_range _Range>
-using __range_to_alloc_type =
+using __range_to_alloc_type _LIBCPP_NODEBUG =
     pair<const typename ranges::range_value_t<_Range>::first_type, typename ranges::range_value_t<_Range>::second_type>;
 
 #endif
diff --git a/libcxx/include/__iterator/reverse_iterator.h b/libcxx/include/__iterator/reverse_iterator.h
index 5e88d86ad5e9..5bd1f868d3ff 100644
--- a/libcxx/include/__iterator/reverse_iterator.h
+++ b/libcxx/include/__iterator/reverse_iterator.h
@@ -329,8 +329,8 @@ __reverse_range(_Range&& __range) {
 
 template <class _Iter, bool __b>
 struct __unwrap_iter_impl<reverse_iterator<reverse_iterator<_Iter> >, __b> {
-  using _UnwrappedIter  = decltype(__unwrap_iter_impl<_Iter>::__unwrap(std::declval<_Iter>()));
-  using _ReverseWrapper = reverse_iterator<reverse_iterator<_Iter> >;
+  using _UnwrappedIter _LIBCPP_NODEBUG  = decltype(__unwrap_iter_impl<_Iter>::__unwrap(std::declval<_Iter>()));
+  using _ReverseWrapper _LIBCPP_NODEBUG = reverse_iterator<reverse_iterator<_Iter> >;
 
   static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _ReverseWrapper
   __rewrap(_ReverseWrapper __orig_iter, _UnwrappedIter __unwrapped_iter) {
diff --git a/libcxx/include/__iterator/segmented_iterator.h b/libcxx/include/__iterator/segmented_iterator.h
index 8cb54a35a7f5..7a8e1addeacd 100644
--- a/libcxx/include/__iterator/segmented_iterator.h
+++ b/libcxx/include/__iterator/segmented_iterator.h
@@ -72,7 +72,7 @@ template <class _Tp>
 struct __has_specialization<_Tp, sizeof(_Tp) * 0> : true_type {};
 
 template <class _Iterator>
-using __is_segmented_iterator = __has_specialization<__segmented_iterator_traits<_Iterator> >;
+using __is_segmented_iterator _LIBCPP_NODEBUG = __has_specialization<__segmented_iterator_traits<_Iterator> >;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__locale b/libcxx/include/__locale
index b675e01bac81..94dc8a08437b 100644
--- a/libcxx/include/__locale
+++ b/libcxx/include/__locale
@@ -50,7 +50,7 @@ _LIBCPP_HIDE_FROM_ABI const _Facet& use_facet(const locale&);
 class _LIBCPP_EXPORTED_FROM_ABI locale {
 public:
   // locale is essentially a shared_ptr that doesn't support weak_ptrs and never got a move constructor.
-  using __trivially_relocatable = locale;
+  using __trivially_relocatable _LIBCPP_NODEBUG = locale;
 
   // types:
   class _LIBCPP_EXPORTED_FROM_ABI facet;
diff --git a/libcxx/include/__locale_dir/locale_base_api.h b/libcxx/include/__locale_dir/locale_base_api.h
index c8097beb9052..9957429c1e7c 100644
--- a/libcxx/include/__locale_dir/locale_base_api.h
+++ b/libcxx/include/__locale_dir/locale_base_api.h
@@ -135,7 +135,7 @@ namespace __locale {
 //
 // Locale management
 //
-using __locale_t = locale_t;
+using __locale_t _LIBCPP_NODEBUG = locale_t;
 
 inline _LIBCPP_HIDE_FROM_ABI __locale_t __newlocale(int __category_mask, const char* __name, __locale_t __loc) {
   return newlocale(__category_mask, __name, __loc);
diff --git a/libcxx/include/__mdspan/extents.h b/libcxx/include/__mdspan/extents.h
index edbc30a7a40e..65a697769bda 100644
--- a/libcxx/include/__mdspan/extents.h
+++ b/libcxx/include/__mdspan/extents.h
@@ -129,14 +129,14 @@ private:
   // Static values member
   static constexpr size_t __size_         = sizeof...(_Values);
   static constexpr size_t __size_dynamic_ = ((_Values == _DynTag) + ... + 0);
-  using _StaticValues                     = __static_array<_TStatic, _Values...>;
-  using _DynamicValues                    = __possibly_empty_array<_TDynamic, __size_dynamic_>;
+  using _StaticValues _LIBCPP_NODEBUG     = __static_array<_TStatic, _Values...>;
+  using _DynamicValues _LIBCPP_NODEBUG    = __possibly_empty_array<_TDynamic, __size_dynamic_>;
 
   // Dynamic values member
   _LIBCPP_NO_UNIQUE_ADDRESS _DynamicValues __dyn_vals_;
 
   // static mapping of indices to the position in the dynamic values array
-  using _DynamicIdxMap = __static_partial_sums<static_cast<size_t>(_Values == _DynTag)...>;
+  using _DynamicIdxMap _LIBCPP_NODEBUG = __static_partial_sums<static_cast<size_t>(_Values == _DynTag)...>;
 
   template <size_t... _Indices>
   _LIBCPP_HIDE_FROM_ABI static constexpr _DynamicValues __zeros(index_sequence<_Indices...>) noexcept {
@@ -292,7 +292,8 @@ private:
   static constexpr rank_type __rank_dynamic_ = ((_Extents == dynamic_extent) + ... + 0);
 
   // internal storage type using __maybe_static_array
-  using _Values = __mdspan_detail::__maybe_static_array<_IndexType, size_t, dynamic_extent, _Extents...>;
+  using _Values _LIBCPP_NODEBUG =
+      __mdspan_detail::__maybe_static_array<_IndexType, size_t, dynamic_extent, _Extents...>;
   [[no_unique_address]] _Values __vals_;
 
 public:
diff --git a/libcxx/include/__memory/allocation_guard.h b/libcxx/include/__memory/allocation_guard.h
index 66d6a5002c29..66edcd92ed61 100644
--- a/libcxx/include/__memory/allocation_guard.h
+++ b/libcxx/include/__memory/allocation_guard.h
@@ -45,8 +45,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // custom allocator.
 template <class _Alloc>
 struct __allocation_guard {
-  using _Pointer = typename allocator_traits<_Alloc>::pointer;
-  using _Size    = typename allocator_traits<_Alloc>::size_type;
+  using _Pointer _LIBCPP_NODEBUG = typename allocator_traits<_Alloc>::pointer;
+  using _Size _LIBCPP_NODEBUG    = typename allocator_traits<_Alloc>::size_type;
 
   template <class _AllocT> // we perform the allocator conversion inside the constructor
   _LIBCPP_HIDE_FROM_ABI explicit __allocation_guard(_AllocT __alloc, _Size __n)
diff --git a/libcxx/include/__memory/pointer_traits.h b/libcxx/include/__memory/pointer_traits.h
index e35cfb7c3b87..afe3d1bf8a2d 100644
--- a/libcxx/include/__memory/pointer_traits.h
+++ b/libcxx/include/__memory/pointer_traits.h
@@ -176,10 +176,10 @@ public:
 
 #ifndef _LIBCPP_CXX03_LANG
 template <class _From, class _To>
-using __rebind_pointer_t = typename pointer_traits<_From>::template rebind<_To>;
+using __rebind_pointer_t _LIBCPP_NODEBUG = typename pointer_traits<_From>::template rebind<_To>;
 #else
 template <class _From, class _To>
-using __rebind_pointer_t = typename pointer_traits<_From>::template rebind<_To>::other;
+using __rebind_pointer_t _LIBCPP_NODEBUG = typename pointer_traits<_From>::template rebind<_To>::other;
 #endif
 
 // to_address
@@ -276,7 +276,7 @@ struct __pointer_of<_Tp> {
 };
 
 template <typename _Tp>
-using __pointer_of_t = typename __pointer_of<_Tp>::type;
+using __pointer_of_t _LIBCPP_NODEBUG = typename __pointer_of<_Tp>::type;
 
 template <class _Tp, class _Up>
 struct __pointer_of_or {
@@ -290,7 +290,7 @@ struct __pointer_of_or<_Tp, _Up> {
 };
 
 template <typename _Tp, typename _Up>
-using __pointer_of_or_t = typename __pointer_of_or<_Tp, _Up>::type;
+using __pointer_of_or_t _LIBCPP_NODEBUG = typename __pointer_of_or<_Tp, _Up>::type;
 
 template <class _Smart>
 concept __resettable_smart_pointer = requires(_Smart __s) { __s.reset(); };
diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h
index 97e4031499ed..06b1fc488cf5 100644
--- a/libcxx/include/__memory/shared_ptr.h
+++ b/libcxx/include/__memory/shared_ptr.h
@@ -141,7 +141,7 @@ struct __for_overwrite_tag {};
 
 template <class _Tp, class _Alloc>
 struct __shared_ptr_emplace : __shared_weak_count {
-  using __value_type = __remove_cv_t<_Tp>;
+  using __value_type _LIBCPP_NODEBUG = __remove_cv_t<_Tp>;
 
   template <class... _Args,
             class _Allocator                                                                         = _Alloc,
@@ -293,7 +293,8 @@ struct __shared_ptr_deleter_ctor_reqs {
 };
 
 template <class _Dp>
-using __shared_ptr_nullptr_deleter_ctor_reqs = _And<is_move_constructible<_Dp>, __well_formed_deleter<_Dp, nullptr_t> >;
+using __shared_ptr_nullptr_deleter_ctor_reqs _LIBCPP_NODEBUG =
+    _And<is_move_constructible<_Dp>, __well_formed_deleter<_Dp, nullptr_t> >;
 
 #if defined(_LIBCPP_ABI_ENABLE_SHARED_PTR_TRIVIAL_ABI)
 #  define _LIBCPP_SHARED_PTR_TRIVIAL_ABI __attribute__((__trivial_abi__))
@@ -315,7 +316,7 @@ public:
 
   // A shared_ptr contains only two raw pointers which point to the heap and move constructing already doesn't require
   // any bookkeeping, so it's always trivially relocatable.
-  using __trivially_relocatable = shared_ptr;
+  using __trivially_relocatable _LIBCPP_NODEBUG = shared_ptr;
 
 private:
   element_type* __ptr_;
@@ -1210,7 +1211,7 @@ public:
 
   // A weak_ptr contains only two raw pointers which point to the heap and move constructing already doesn't require
   // any bookkeeping, so it's always trivially relocatable.
-  using __trivially_relocatable = weak_ptr;
+  using __trivially_relocatable _LIBCPP_NODEBUG = weak_ptr;
 
 private:
   element_type* __ptr_;
diff --git a/libcxx/include/__memory/unique_ptr.h b/libcxx/include/__memory/unique_ptr.h
index 2368f7b03e00..29d391fc80fe 100644
--- a/libcxx/include/__memory/unique_ptr.h
+++ b/libcxx/include/__memory/unique_ptr.h
@@ -153,7 +153,7 @@ public:
   //
   // This unique_ptr implementation only contains a pointer to the unique object and a deleter, so there are no
   // references to itself. This means that the entire structure is trivially relocatable if its members are.
-  using __trivially_relocatable = __conditional_t<
+  using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t<
       __libcpp_is_trivially_relocatable<pointer>::value && __libcpp_is_trivially_relocatable<deleter_type>::value,
       unique_ptr,
       void>;
@@ -189,7 +189,7 @@ private:
                      (!is_reference<_Dp>::value && is_convertible<_UDel, _Dp>::value) >;
 
   template <class _UDel>
-  using _EnableIfDeleterAssignable = __enable_if_t< is_assignable<_Dp&, _UDel&&>::value >;
+  using _EnableIfDeleterAssignable _LIBCPP_NODEBUG = __enable_if_t< is_assignable<_Dp&, _UDel&&>::value >;
 
 public:
   template <bool _Dummy = true, class = _EnableIfDeleterDefaultConstructible<_Dummy> >
@@ -419,7 +419,7 @@ public:
   //
   // This unique_ptr implementation only contains a pointer to the unique object and a deleter, so there are no
   // references to itself. This means that the entire structure is trivially relocatable if its members are.
-  using __trivially_relocatable = __conditional_t<
+  using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t<
       __libcpp_is_trivially_relocatable<pointer>::value && __libcpp_is_trivially_relocatable<deleter_type>::value,
       unique_ptr,
       void>;
@@ -430,9 +430,9 @@ private:
 
   _LIBCPP_COMPRESSED_PAIR(pointer, __ptr_, deleter_type, __deleter_);
 #ifdef _LIBCPP_ABI_BOUNDED_UNIQUE_PTR
-  using _BoundsChecker = __unique_ptr_array_bounds_stored;
+  using _BoundsChecker _LIBCPP_NODEBUG = __unique_ptr_array_bounds_stored;
 #else
-  using _BoundsChecker = __unique_ptr_array_bounds_stateless;
+  using _BoundsChecker _LIBCPP_NODEBUG = __unique_ptr_array_bounds_stateless;
 #endif
   _LIBCPP_NO_UNIQUE_ADDRESS _BoundsChecker __checker_;
 
diff --git a/libcxx/include/__memory/unique_temporary_buffer.h b/libcxx/include/__memory/unique_temporary_buffer.h
index ca6292338c00..dea7fa8e1872 100644
--- a/libcxx/include/__memory/unique_temporary_buffer.h
+++ b/libcxx/include/__memory/unique_temporary_buffer.h
@@ -45,7 +45,7 @@ struct __temporary_buffer_deleter {
 };
 
 template <class _Tp>
-using __unique_temporary_buffer = unique_ptr<_Tp, __temporary_buffer_deleter<_Tp> >;
+using __unique_temporary_buffer _LIBCPP_NODEBUG = unique_ptr<_Tp, __temporary_buffer_deleter<_Tp> >;
 
 template <class _Tp>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_CFI _LIBCPP_CONSTEXPR_SINCE_CXX23 __unique_temporary_buffer<_Tp>
diff --git a/libcxx/include/__node_handle b/libcxx/include/__node_handle
index d0b35bfd1934..8f32f2de8339 100644
--- a/libcxx/include/__node_handle
+++ b/libcxx/include/__node_handle
@@ -188,10 +188,10 @@ struct __map_node_handle_specifics {
 };
 
 template <class _NodeType, class _Alloc>
-using __set_node_handle = __basic_node_handle< _NodeType, _Alloc, __set_node_handle_specifics>;
+using __set_node_handle _LIBCPP_NODEBUG = __basic_node_handle< _NodeType, _Alloc, __set_node_handle_specifics>;
 
 template <class _NodeType, class _Alloc>
-using __map_node_handle = __basic_node_handle< _NodeType, _Alloc, __map_node_handle_specifics>;
+using __map_node_handle _LIBCPP_NODEBUG = __basic_node_handle< _NodeType, _Alloc, __map_node_handle_specifics>;
 
 template <class _Iterator, class _NodeType>
 struct _LIBCPP_TEMPLATE_VIS __insert_return_type {
diff --git a/libcxx/include/__pstl/backend_fwd.h b/libcxx/include/__pstl/backend_fwd.h
index 2132e8dbceb3..a7d53b6a1c98 100644
--- a/libcxx/include/__pstl/backend_fwd.h
+++ b/libcxx/include/__pstl/backend_fwd.h
@@ -53,11 +53,13 @@ struct __serial_backend_tag;
 struct __std_thread_backend_tag;
 
 #  if defined(_LIBCPP_PSTL_BACKEND_SERIAL)
-using __current_configuration = __backend_configuration<__serial_backend_tag, __default_backend_tag>;
+using __current_configuration _LIBCPP_NODEBUG = __backend_configuration<__serial_backend_tag, __default_backend_tag>;
 #  elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD)
-using __current_configuration = __backend_configuration<__std_thread_backend_tag, __default_backend_tag>;
+using __current_configuration _LIBCPP_NODEBUG =
+    __backend_configuration<__std_thread_backend_tag, __default_backend_tag>;
 #  elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
-using __current_configuration = __backend_configuration<__libdispatch_backend_tag, __default_backend_tag>;
+using __current_configuration _LIBCPP_NODEBUG =
+    __backend_configuration<__libdispatch_backend_tag, __default_backend_tag>;
 #  else
 
 // ...New vendors can add parallel backends here...
diff --git a/libcxx/include/__pstl/dispatch.h b/libcxx/include/__pstl/dispatch.h
index ea40fa79eb94..828842368e33 100644
--- a/libcxx/include/__pstl/dispatch.h
+++ b/libcxx/include/__pstl/dispatch.h
@@ -58,7 +58,8 @@ struct __find_first_implemented<_Algorithm, __backend_configuration<_B1, _Bn...>
           __find_first_implemented<_Algorithm, __backend_configuration<_Bn...>, _ExecutionPolicy> > {};
 
 template <template <class, class> class _Algorithm, class _BackendConfiguration, class _ExecutionPolicy>
-using __dispatch = typename __find_first_implemented<_Algorithm, _BackendConfiguration, _ExecutionPolicy>::type;
+using __dispatch _LIBCPP_NODEBUG =
+    typename __find_first_implemented<_Algorithm, _BackendConfiguration, _ExecutionPolicy>::type;
 
 } // namespace __pstl
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__ranges/chunk_by_view.h b/libcxx/include/__ranges/chunk_by_view.h
index e4df589bc286..71fee3a4f2d1 100644
--- a/libcxx/include/__ranges/chunk_by_view.h
+++ b/libcxx/include/__ranges/chunk_by_view.h
@@ -59,7 +59,7 @@ class _LIBCPP_ABI_LLVM18_NO_UNIQUE_ADDRESS chunk_by_view : public view_interface
   _LIBCPP_NO_UNIQUE_ADDRESS __movable_box<_Pred> __pred_;
 
   // We cache the result of begin() to allow providing an amortized O(1).
-  using _Cache = __non_propagating_cache<iterator_t<_View>>;
+  using _Cache _LIBCPP_NODEBUG = __non_propagating_cache<iterator_t<_View>>;
   _Cache __cached_begin_;
 
   class __iterator;
diff --git a/libcxx/include/__ranges/drop_view.h b/libcxx/include/__ranges/drop_view.h
index 87f66f17a2ab..3f963d04fff2 100644
--- a/libcxx/include/__ranges/drop_view.h
+++ b/libcxx/include/__ranges/drop_view.h
@@ -64,7 +64,7 @@ class drop_view : public view_interface<drop_view<_View>> {
   // Note: drop_view<input-range>::begin() is still trivially amortized O(1) because
   // one can't call begin() on it more than once.
   static constexpr bool _UseCache = forward_range<_View> && !(random_access_range<_View> && sized_range<_View>);
-  using _Cache                    = _If<_UseCache, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
+  using _Cache _LIBCPP_NODEBUG    = _If<_UseCache, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
   _LIBCPP_NO_UNIQUE_ADDRESS _Cache __cached_begin_ = _Cache();
   range_difference_t<_View> __count_               = 0;
   _View __base_                                    = _View();
@@ -204,7 +204,7 @@ struct __passthrough_type<subrange<_Iter, _Sent, _Kind>> {
 };
 
 template <class _Tp>
-using __passthrough_type_t = typename __passthrough_type<_Tp>::type;
+using __passthrough_type_t _LIBCPP_NODEBUG = typename __passthrough_type<_Tp>::type;
 
 struct __fn {
   // [range.drop.overview]: the `empty_view` case.
diff --git a/libcxx/include/__ranges/drop_while_view.h b/libcxx/include/__ranges/drop_while_view.h
index 6413ff52bc0e..bc7f019393a8 100644
--- a/libcxx/include/__ranges/drop_while_view.h
+++ b/libcxx/include/__ranges/drop_while_view.h
@@ -90,7 +90,7 @@ private:
   _LIBCPP_NO_UNIQUE_ADDRESS __movable_box<_Pred> __pred_;
 
   static constexpr bool _UseCache = forward_range<_View>;
-  using _Cache                    = _If<_UseCache, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
+  using _Cache _LIBCPP_NODEBUG    = _If<_UseCache, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
   _LIBCPP_NO_UNIQUE_ADDRESS _Cache __cached_begin_ = _Cache();
 };
 
diff --git a/libcxx/include/__ranges/elements_view.h b/libcxx/include/__ranges/elements_view.h
index c99282f37960..5121298fb684 100644
--- a/libcxx/include/__ranges/elements_view.h
+++ b/libcxx/include/__ranges/elements_view.h
@@ -171,7 +171,7 @@ class elements_view<_View, _Np>::__iterator
   template <bool>
   friend class __sentinel;
 
-  using _Base = __maybe_const<_Const, _View>;
+  using _Base _LIBCPP_NODEBUG = __maybe_const<_Const, _View>;
 
   iterator_t<_Base> __current_ = iterator_t<_Base>();
 
@@ -335,7 +335,7 @@ template <input_range _View, size_t _Np>
 template <bool _Const>
 class elements_view<_View, _Np>::__sentinel {
 private:
-  using _Base                                        = __maybe_const<_Const, _View>;
+  using _Base _LIBCPP_NODEBUG                        = __maybe_const<_Const, _View>;
   _LIBCPP_NO_UNIQUE_ADDRESS sentinel_t<_Base> __end_ = sentinel_t<_Base>();
 
   template <bool>
diff --git a/libcxx/include/__ranges/filter_view.h b/libcxx/include/__ranges/filter_view.h
index 22f67b2d967d..07980e735319 100644
--- a/libcxx/include/__ranges/filter_view.h
+++ b/libcxx/include/__ranges/filter_view.h
@@ -61,7 +61,7 @@ class _LIBCPP_ABI_LLVM18_NO_UNIQUE_ADDRESS filter_view : public view_interface<f
   // We cache the result of begin() to allow providing an amortized O(1) begin() whenever
   // the underlying range is at least a forward_range.
   static constexpr bool _UseCache = forward_range<_View>;
-  using _Cache                    = _If<_UseCache, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
+  using _Cache _LIBCPP_NODEBUG    = _If<_UseCache, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
   _LIBCPP_NO_UNIQUE_ADDRESS _Cache __cached_begin_ = _Cache();
 
   class __iterator;
@@ -115,7 +115,7 @@ struct __filter_iterator_category {};
 
 template <forward_range _View>
 struct __filter_iterator_category<_View> {
-  using _Cat = typename iterator_traits<iterator_t<_View>>::iterator_category;
+  using _Cat _LIBCPP_NODEBUG = typename iterator_traits<iterator_t<_View>>::iterator_category;
   using iterator_category =
       _If<derived_from<_Cat, bidirectional_iterator_tag>,
           bidirectional_iterator_tag,
diff --git a/libcxx/include/__ranges/iota_view.h b/libcxx/include/__ranges/iota_view.h
index b2fa958a0f56..4b84585258b9 100644
--- a/libcxx/include/__ranges/iota_view.h
+++ b/libcxx/include/__ranges/iota_view.h
@@ -68,7 +68,7 @@ struct __get_wider_signed {
 };
 
 template <class _Start>
-using _IotaDiffT =
+using _IotaDiffT _LIBCPP_NODEBUG =
     typename _If< (!integral<_Start> || sizeof(iter_difference_t<_Start>) > sizeof(_Start)),
                   type_identity<iter_difference_t<_Start>>,
                   __get_wider_signed<_Start> >::type;
diff --git a/libcxx/include/__ranges/join_view.h b/libcxx/include/__ranges/join_view.h
index 6aadd387860e..327b349f476a 100644
--- a/libcxx/include/__ranges/join_view.h
+++ b/libcxx/include/__ranges/join_view.h
@@ -55,8 +55,8 @@ struct __join_view_iterator_category {};
 template <class _View>
   requires is_reference_v<range_reference_t<_View>> && forward_range<_View> && forward_range<range_reference_t<_View>>
 struct __join_view_iterator_category<_View> {
-  using _OuterC = typename iterator_traits<iterator_t<_View>>::iterator_category;
-  using _InnerC = typename iterator_traits<iterator_t<range_reference_t<_View>>>::iterator_category;
+  using _OuterC _LIBCPP_NODEBUG = typename iterator_traits<iterator_t<_View>>::iterator_category;
+  using _InnerC _LIBCPP_NODEBUG = typename iterator_traits<iterator_t<range_reference_t<_View>>>::iterator_category;
 
   using iterator_category =
       _If< derived_from<_OuterC, bidirectional_iterator_tag> && derived_from<_InnerC, bidirectional_iterator_tag> &&
@@ -71,7 +71,7 @@ template <input_range _View>
   requires view<_View> && input_range<range_reference_t<_View>>
 class join_view : public view_interface<join_view<_View>> {
 private:
-  using _InnerRange = range_reference_t<_View>;
+  using _InnerRange _LIBCPP_NODEBUG = range_reference_t<_View>;
 
   template <bool>
   struct __iterator;
@@ -85,11 +85,12 @@ private:
   _LIBCPP_NO_UNIQUE_ADDRESS _View __base_ = _View();
 
   static constexpr bool _UseOuterCache = !forward_range<_View>;
-  using _OuterCache                    = _If<_UseOuterCache, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
+  using _OuterCache _LIBCPP_NODEBUG    = _If<_UseOuterCache, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
   _LIBCPP_NO_UNIQUE_ADDRESS _OuterCache __outer_;
 
   static constexpr bool _UseInnerCache = !is_reference_v<_InnerRange>;
-  using _InnerCache = _If<_UseInnerCache, __non_propagating_cache<remove_cvref_t<_InnerRange>>, __empty_cache>;
+  using _InnerCache _LIBCPP_NODEBUG =
+      _If<_UseInnerCache, __non_propagating_cache<remove_cvref_t<_InnerRange>>, __empty_cache>;
   _LIBCPP_NO_UNIQUE_ADDRESS _InnerCache __inner_;
 
 public:
@@ -155,9 +156,9 @@ private:
   template <bool>
   friend struct __sentinel;
 
-  using _Parent            = __maybe_const<_Const, join_view>;
-  using _Base              = __maybe_const<_Const, _View>;
-  sentinel_t<_Base> __end_ = sentinel_t<_Base>();
+  using _Parent _LIBCPP_NODEBUG = __maybe_const<_Const, join_view>;
+  using _Base _LIBCPP_NODEBUG   = __maybe_const<_Const, _View>;
+  sentinel_t<_Base> __end_      = sentinel_t<_Base>();
 
 public:
   _LIBCPP_HIDE_FROM_ABI __sentinel() = default;
@@ -190,18 +191,18 @@ struct join_view<_View>::__iterator final : public __join_view_iterator_category
   static constexpr bool __is_join_view_iterator = true;
 
 private:
-  using _Parent     = __maybe_const<_Const, join_view<_View>>;
-  using _Base       = __maybe_const<_Const, _View>;
-  using _Outer      = iterator_t<_Base>;
-  using _Inner      = iterator_t<range_reference_t<_Base>>;
-  using _InnerRange = range_reference_t<_View>;
+  using _Parent _LIBCPP_NODEBUG     = __maybe_const<_Const, join_view<_View>>;
+  using _Base _LIBCPP_NODEBUG       = __maybe_const<_Const, _View>;
+  using _Outer _LIBCPP_NODEBUG      = iterator_t<_Base>;
+  using _Inner _LIBCPP_NODEBUG      = iterator_t<range_reference_t<_Base>>;
+  using _InnerRange _LIBCPP_NODEBUG = range_reference_t<_View>;
 
   static_assert(!_Const || forward_range<_Base>, "Const can only be true when Base models forward_range.");
 
   static constexpr bool __ref_is_glvalue = is_reference_v<range_reference_t<_Base>>;
 
   static constexpr bool _OuterPresent           = forward_range<_Base>;
-  using _OuterType                              = _If<_OuterPresent, _Outer, std::__empty>;
+  using _OuterType _LIBCPP_NODEBUG              = _If<_OuterPresent, _Outer, std::__empty>;
   _LIBCPP_NO_UNIQUE_ADDRESS _OuterType __outer_ = _OuterType();
 
   optional<_Inner> __inner_;
@@ -379,7 +380,7 @@ template <class _JoinViewIterator>
 struct __segmented_iterator_traits<_JoinViewIterator> {
   using __segment_iterator _LIBCPP_NODEBUG =
       __iterator_with_data<typename _JoinViewIterator::_Outer, typename _JoinViewIterator::_Parent*>;
-  using __local_iterator = typename _JoinViewIterator::_Inner;
+  using __local_iterator _LIBCPP_NODEBUG = typename _JoinViewIterator::_Inner;
 
   // TODO: Would it make sense to enable the optimization for other iterator types?
 
diff --git a/libcxx/include/__ranges/lazy_split_view.h b/libcxx/include/__ranges/lazy_split_view.h
index 0dcbc134a21f..cca9191d2681 100644
--- a/libcxx/include/__ranges/lazy_split_view.h
+++ b/libcxx/include/__ranges/lazy_split_view.h
@@ -72,7 +72,8 @@ class lazy_split_view : public view_interface<lazy_split_view<_View, _Pattern>>
   _LIBCPP_NO_UNIQUE_ADDRESS _View __base_       = _View();
   _LIBCPP_NO_UNIQUE_ADDRESS _Pattern __pattern_ = _Pattern();
 
-  using _MaybeCurrent = _If<!forward_range<_View>, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
+  using _MaybeCurrent _LIBCPP_NODEBUG =
+      _If<!forward_range<_View>, __non_propagating_cache<iterator_t<_View>>, __empty_cache>;
   _LIBCPP_NO_UNIQUE_ADDRESS _MaybeCurrent __current_ = _MaybeCurrent();
 
   template <bool>
@@ -146,11 +147,11 @@ private:
     friend struct __inner_iterator;
     friend __outer_iterator<true>;
 
-    using _Parent = __maybe_const<_Const, lazy_split_view>;
-    using _Base   = __maybe_const<_Const, _View>;
+    using _Parent _LIBCPP_NODEBUG = __maybe_const<_Const, lazy_split_view>;
+    using _Base _LIBCPP_NODEBUG   = __maybe_const<_Const, _View>;
 
     _Parent* __parent_                                 = nullptr;
-    using _MaybeCurrent                                = _If<forward_range<_View>, iterator_t<_Base>, __empty_cache>;
+    using _MaybeCurrent _LIBCPP_NODEBUG                = _If<forward_range<_View>, iterator_t<_Base>, __empty_cache>;
     _LIBCPP_NO_UNIQUE_ADDRESS _MaybeCurrent __current_ = _MaybeCurrent();
     bool __trailing_empty_                             = false;
 
@@ -283,7 +284,7 @@ private:
   template <bool _Const>
   struct __inner_iterator : __inner_iterator_category<__maybe_const<_Const, _View>> {
   private:
-    using _Base = __maybe_const<_Const, _View>;
+    using _Base _LIBCPP_NODEBUG = __maybe_const<_Const, _View>;
     // Workaround for a GCC issue.
     static constexpr bool _OuterConst = _Const;
     __outer_iterator<_Const> __i_     = __outer_iterator<_OuterConst>();
diff --git a/libcxx/include/__ranges/repeat_view.h b/libcxx/include/__ranges/repeat_view.h
index 93ceaf1711d3..61a8b6357105 100644
--- a/libcxx/include/__ranges/repeat_view.h
+++ b/libcxx/include/__ranges/repeat_view.h
@@ -61,7 +61,7 @@ struct __repeat_view_iterator_difference<_Tp> {
 };
 
 template <class _Tp>
-using __repeat_view_iterator_difference_t = typename __repeat_view_iterator_difference<_Tp>::type;
+using __repeat_view_iterator_difference_t _LIBCPP_NODEBUG = typename __repeat_view_iterator_difference<_Tp>::type;
 
 namespace views::__drop {
 struct __fn;
@@ -139,7 +139,7 @@ template <move_constructible _Tp, semiregular _Bound>
 class repeat_view<_Tp, _Bound>::__iterator {
   friend class repeat_view;
 
-  using _IndexT = conditional_t<same_as<_Bound, unreachable_sentinel_t>, ptrdiff_t, _Bound>;
+  using _IndexT _LIBCPP_NODEBUG = conditional_t<same_as<_Bound, unreachable_sentinel_t>, ptrdiff_t, _Bound>;
 
   _LIBCPP_HIDE_FROM_ABI constexpr explicit __iterator(const _Tp* __value, _IndexT __bound_sentinel = _IndexT())
       : __value_(__value), __current_(__bound_sentinel) {}
diff --git a/libcxx/include/__ranges/reverse_view.h b/libcxx/include/__ranges/reverse_view.h
index 796f5be22328..80d54b9a6c83 100644
--- a/libcxx/include/__ranges/reverse_view.h
+++ b/libcxx/include/__ranges/reverse_view.h
@@ -47,7 +47,8 @@ class reverse_view : public view_interface<reverse_view<_View>> {
   // We cache begin() whenever ranges::next is not guaranteed O(1) to provide an
   // amortized O(1) begin() method.
   static constexpr bool _UseCache = !random_access_range<_View> && !common_range<_View>;
-  using _Cache = _If<_UseCache, __non_propagating_cache<reverse_iterator<iterator_t<_View>>>, __empty_cache>;
+  using _Cache _LIBCPP_NODEBUG =
+      _If<_UseCache, __non_propagating_cache<reverse_iterator<iterator_t<_View>>>, __empty_cache>;
   _LIBCPP_NO_UNIQUE_ADDRESS _Cache __cached_begin_ = _Cache();
   _LIBCPP_NO_UNIQUE_ADDRESS _View __base_          = _View();
 
diff --git a/libcxx/include/__ranges/split_view.h b/libcxx/include/__ranges/split_view.h
index 7527281905ff..2ec908ba4070 100644
--- a/libcxx/include/__ranges/split_view.h
+++ b/libcxx/include/__ranges/split_view.h
@@ -52,7 +52,7 @@ class split_view : public view_interface<split_view<_View, _Pattern>> {
 private:
   _LIBCPP_NO_UNIQUE_ADDRESS _View __base_       = _View();
   _LIBCPP_NO_UNIQUE_ADDRESS _Pattern __pattern_ = _Pattern();
-  using _Cache                                  = __non_propagating_cache<subrange<iterator_t<_View>>>;
+  using _Cache _LIBCPP_NODEBUG                  = __non_propagating_cache<subrange<iterator_t<_View>>>;
   _Cache __cached_begin_                        = _Cache();
 
   template <class, class>
diff --git a/libcxx/include/__ranges/subrange.h b/libcxx/include/__ranges/subrange.h
index a40eab3c5a25..2d006d3570a7 100644
--- a/libcxx/include/__ranges/subrange.h
+++ b/libcxx/include/__ranges/subrange.h
@@ -82,7 +82,7 @@ private:
   struct _Empty {
     _LIBCPP_HIDE_FROM_ABI constexpr _Empty(auto) noexcept {}
   };
-  using _Size = conditional_t<_StoreSize, make_unsigned_t<iter_difference_t<_Iter>>, _Empty>;
+  using _Size _LIBCPP_NODEBUG = conditional_t<_StoreSize, make_unsigned_t<iter_difference_t<_Iter>>, _Empty>;
   _LIBCPP_NO_UNIQUE_ADDRESS _Iter __begin_ = _Iter();
   _LIBCPP_NO_UNIQUE_ADDRESS _Sent __end_   = _Sent();
   _LIBCPP_NO_UNIQUE_ADDRESS _Size __size_  = 0;
diff --git a/libcxx/include/__ranges/take_view.h b/libcxx/include/__ranges/take_view.h
index 39f99cee6b4d..5892c1e31fae 100644
--- a/libcxx/include/__ranges/take_view.h
+++ b/libcxx/include/__ranges/take_view.h
@@ -161,9 +161,9 @@ public:
 template <view _View>
 template <bool _Const>
 class take_view<_View>::__sentinel {
-  using _Base = __maybe_const<_Const, _View>;
+  using _Base _LIBCPP_NODEBUG = __maybe_const<_Const, _View>;
   template <bool _OtherConst>
-  using _Iter                                        = counted_iterator<iterator_t<__maybe_const<_OtherConst, _View>>>;
+  using _Iter _LIBCPP_NODEBUG                        = counted_iterator<iterator_t<__maybe_const<_OtherConst, _View>>>;
   _LIBCPP_NO_UNIQUE_ADDRESS sentinel_t<_Base> __end_ = sentinel_t<_Base>();
 
   template <bool>
@@ -244,7 +244,7 @@ struct __passthrough_type<subrange<_Iter, _Sent, _Kind>> {
 };
 
 template <class _Tp>
-using __passthrough_type_t = typename __passthrough_type<_Tp>::type;
+using __passthrough_type_t _LIBCPP_NODEBUG = typename __passthrough_type<_Tp>::type;
 
 struct __fn {
   // [range.take.overview]: the `empty_view` case.
diff --git a/libcxx/include/__ranges/take_while_view.h b/libcxx/include/__ranges/take_while_view.h
index b7cb0aef58f1..4977f139fc55 100644
--- a/libcxx/include/__ranges/take_while_view.h
+++ b/libcxx/include/__ranges/take_while_view.h
@@ -103,7 +103,7 @@ template <view _View, class _Pred>
   requires input_range<_View> && is_object_v<_Pred> && indirect_unary_predicate<const _Pred, iterator_t<_View>>
 template <bool _Const>
 class take_while_view<_View, _Pred>::__sentinel {
-  using _Base = __maybe_const<_Const, _View>;
+  using _Base _LIBCPP_NODEBUG = __maybe_const<_Const, _View>;
 
   sentinel_t<_Base> __end_ = sentinel_t<_Base>();
   const _Pred* __pred_     = nullptr;
diff --git a/libcxx/include/__ranges/transform_view.h b/libcxx/include/__ranges/transform_view.h
index 0ce1bfe88350..4ae21e92b69d 100644
--- a/libcxx/include/__ranges/transform_view.h
+++ b/libcxx/include/__ranges/transform_view.h
@@ -159,7 +159,7 @@ struct __transform_view_iterator_category_base {};
 
 template <forward_range _View, class _Fn>
 struct __transform_view_iterator_category_base<_View, _Fn> {
-  using _Cat = typename iterator_traits<iterator_t<_View>>::iterator_category;
+  using _Cat _LIBCPP_NODEBUG = typename iterator_traits<iterator_t<_View>>::iterator_category;
 
   using iterator_category =
       conditional_t< is_reference_v<invoke_result_t<_Fn&, range_reference_t<_View>>>,
@@ -177,8 +177,8 @@ template <bool _Const>
 class transform_view<_View, _Fn>::__iterator
     : public __transform_view_iterator_category_base<_View, __maybe_const<_Const, _Fn>> {
 
-  using _Parent = __maybe_const<_Const, transform_view>;
-  using _Base   = __maybe_const<_Const, _View>;
+  using _Parent _LIBCPP_NODEBUG = __maybe_const<_Const, transform_view>;
+  using _Base _LIBCPP_NODEBUG   = __maybe_const<_Const, _View>;
 
   _Parent* __parent_ = nullptr;
 
@@ -338,8 +338,8 @@ template <input_range _View, copy_constructible _Fn>
   requires __transform_view_constraints<_View, _Fn>
 template <bool _Const>
 class transform_view<_View, _Fn>::__sentinel {
-  using _Parent = __maybe_const<_Const, transform_view>;
-  using _Base   = __maybe_const<_Const, _View>;
+  using _Parent _LIBCPP_NODEBUG = __maybe_const<_Const, transform_view>;
+  using _Base _LIBCPP_NODEBUG   = __maybe_const<_Const, _View>;
 
   sentinel_t<_Base> __end_ = sentinel_t<_Base>();
 
diff --git a/libcxx/include/__split_buffer b/libcxx/include/__split_buffer
index 9d7fd3e0df62..a8f679cc30a9 100644
--- a/libcxx/include/__split_buffer
+++ b/libcxx/include/__split_buffer
@@ -51,24 +51,24 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Tp, class _Allocator = allocator<_Tp> >
 struct __split_buffer {
 public:
-  using value_type      = _Tp;
-  using allocator_type  = _Allocator;
-  using __alloc_rr      = __libcpp_remove_reference_t<allocator_type>;
-  using __alloc_traits  = allocator_traits<__alloc_rr>;
-  using reference       = value_type&;
-  using const_reference = const value_type&;
-  using size_type       = typename __alloc_traits::size_type;
-  using difference_type = typename __alloc_traits::difference_type;
-  using pointer         = typename __alloc_traits::pointer;
-  using const_pointer   = typename __alloc_traits::const_pointer;
-  using iterator        = pointer;
-  using const_iterator  = const_pointer;
+  using value_type                     = _Tp;
+  using allocator_type                 = _Allocator;
+  using __alloc_rr _LIBCPP_NODEBUG     = __libcpp_remove_reference_t<allocator_type>;
+  using __alloc_traits _LIBCPP_NODEBUG = allocator_traits<__alloc_rr>;
+  using reference                      = value_type&;
+  using const_reference                = const value_type&;
+  using size_type                      = typename __alloc_traits::size_type;
+  using difference_type                = typename __alloc_traits::difference_type;
+  using pointer                        = typename __alloc_traits::pointer;
+  using const_pointer                  = typename __alloc_traits::const_pointer;
+  using iterator                       = pointer;
+  using const_iterator                 = const_pointer;
 
   // A __split_buffer contains the following members which may be trivially relocatable:
   // - pointer: may be trivially relocatable, so it's checked
   // - allocator_type: may be trivially relocatable, so it's checked
   // __split_buffer doesn't have any self-references, so it's trivially relocatable if its members are.
-  using __trivially_relocatable = __conditional_t<
+  using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t<
       __libcpp_is_trivially_relocatable<pointer>::value && __libcpp_is_trivially_relocatable<allocator_type>::value,
       __split_buffer,
       void>;
diff --git a/libcxx/include/__stop_token/stop_state.h b/libcxx/include/__stop_token/stop_state.h
index 84dc208dda36..cc1f1d830e8d 100644
--- a/libcxx/include/__stop_token/stop_state.h
+++ b/libcxx/include/__stop_token/stop_state.h
@@ -27,7 +27,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if _LIBCPP_STD_VER >= 20 && _LIBCPP_HAS_THREADS
 
 struct __stop_callback_base : __intrusive_node_base<__stop_callback_base> {
-  using __callback_fn_t = void(__stop_callback_base*) noexcept;
+  using __callback_fn_t _LIBCPP_NODEBUG = void(__stop_callback_base*) noexcept;
   _LIBCPP_HIDE_FROM_ABI explicit __stop_callback_base(__callback_fn_t* __callback_fn) : __callback_fn_(__callback_fn) {}
 
   _LIBCPP_HIDE_FROM_ABI void __invoke() noexcept { __callback_fn_(this); }
@@ -58,9 +58,9 @@ class __stop_state {
   // It is used by __intrusive_shared_ptr, but it is stored here for better layout
   atomic<uint32_t> __ref_count_ = 0;
 
-  using __state_t            = uint32_t;
-  using __callback_list_lock = __atomic_unique_lock<__state_t, __callback_list_locked_bit>;
-  using __callback_list      = __intrusive_list_view<__stop_callback_base>;
+  using __state_t _LIBCPP_NODEBUG            = uint32_t;
+  using __callback_list_lock _LIBCPP_NODEBUG = __atomic_unique_lock<__state_t, __callback_list_locked_bit>;
+  using __callback_list _LIBCPP_NODEBUG      = __intrusive_list_view<__stop_callback_base>;
 
   __callback_list __callback_list_;
   __thread_id __requesting_thread_;
diff --git a/libcxx/include/__system_error/system_error.h b/libcxx/include/__system_error/system_error.h
index 918effb6917c..36ccf94cc010 100644
--- a/libcxx/include/__system_error/system_error.h
+++ b/libcxx/include/__system_error/system_error.h
@@ -39,6 +39,10 @@ public:
   _LIBCPP_HIDE_FROM_ABI const error_code& code() const _NOEXCEPT { return __ec_; }
 };
 
+// __ev is expected to be an error in the generic_category domain (e.g. from
+// errno, or std::errc::*), not system_category (e.g. from windows syscalls).
+[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void __throw_system_error(int __ev, const char* __what_arg);
+
 [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI inline void __throw_system_error(error_code __ec, const char* __what_arg) {
 #if _LIBCPP_HAS_EXCEPTIONS
   throw system_error(__ec, __what_arg);
diff --git a/libcxx/include/__thread/support/pthread.h b/libcxx/include/__thread/support/pthread.h
index 531f3e71de83..14e92079dadf 100644
--- a/libcxx/include/__thread/support/pthread.h
+++ b/libcxx/include/__thread/support/pthread.h
@@ -39,7 +39,7 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-using __libcpp_timespec_t = ::timespec;
+using __libcpp_timespec_t _LIBCPP_NODEBUG = ::timespec;
 
 //
 // Mutex
diff --git a/libcxx/include/__tuple/make_tuple_types.h b/libcxx/include/__tuple/make_tuple_types.h
index 3d312395131d..ff95ca4313a5 100644
--- a/libcxx/include/__tuple/make_tuple_types.h
+++ b/libcxx/include/__tuple/make_tuple_types.h
@@ -47,9 +47,9 @@ struct __make_tuple_types_flat<_Tuple<_Types...>, __tuple_indices<_Idx...>> {
 template <class _Vt, size_t _Np, size_t... _Idx>
 struct __make_tuple_types_flat<array<_Vt, _Np>, __tuple_indices<_Idx...>> {
   template <size_t>
-  using __value_type = _Vt;
+  using __value_type _LIBCPP_NODEBUG = _Vt;
   template <class _Tp>
-  using __apply_quals = __tuple_types<__copy_cvref_t<_Tp, __value_type<_Idx>>...>;
+  using __apply_quals _LIBCPP_NODEBUG = __tuple_types<__copy_cvref_t<_Tp, __value_type<_Idx>>...>;
 };
 
 template <class _Tp,
@@ -58,9 +58,9 @@ template <class _Tp,
           bool _SameSize = (_Ep == tuple_size<__libcpp_remove_reference_t<_Tp> >::value)>
 struct __make_tuple_types {
   static_assert(_Sp <= _Ep, "__make_tuple_types input error");
-  using _RawTp = __remove_cvref_t<_Tp>;
-  using _Maker = __make_tuple_types_flat<_RawTp, typename __make_tuple_indices<_Ep, _Sp>::type>;
-  using type   = typename _Maker::template __apply_quals<_Tp>;
+  using _RawTp _LIBCPP_NODEBUG = __remove_cvref_t<_Tp>;
+  using _Maker _LIBCPP_NODEBUG = __make_tuple_types_flat<_RawTp, typename __make_tuple_indices<_Ep, _Sp>::type>;
+  using type                   = typename _Maker::template __apply_quals<_Tp>;
 };
 
 template <class... _Types, size_t _Ep>
diff --git a/libcxx/include/__tuple/sfinae_helpers.h b/libcxx/include/__tuple/sfinae_helpers.h
index 9041d1d4473e..4084e8bb31fd 100644
--- a/libcxx/include/__tuple/sfinae_helpers.h
+++ b/libcxx/include/__tuple/sfinae_helpers.h
@@ -41,7 +41,7 @@ struct __tuple_sfinae_base {
   static auto __do_test(...) -> false_type;
 
   template <class _FromArgs, class _ToArgs>
-  using __constructible = decltype(__do_test<is_constructible>(_ToArgs{}, _FromArgs{}));
+  using __constructible _LIBCPP_NODEBUG = decltype(__do_test<is_constructible>(_ToArgs{}, _FromArgs{}));
 };
 
 // __tuple_constructible
diff --git a/libcxx/include/__tuple/tuple_size.h b/libcxx/include/__tuple/tuple_size.h
index b970280fe378..27d57eb56ba6 100644
--- a/libcxx/include/__tuple/tuple_size.h
+++ b/libcxx/include/__tuple/tuple_size.h
@@ -29,7 +29,7 @@ struct _LIBCPP_TEMPLATE_VIS tuple_size;
 
 #if !defined(_LIBCPP_CXX03_LANG)
 template <class _Tp, class...>
-using __enable_if_tuple_size_imp = _Tp;
+using __enable_if_tuple_size_imp _LIBCPP_NODEBUG = _Tp;
 
 template <class _Tp>
 struct _LIBCPP_TEMPLATE_VIS tuple_size<__enable_if_tuple_size_imp< const _Tp,
diff --git a/libcxx/include/__type_traits/add_lvalue_reference.h b/libcxx/include/__type_traits/add_lvalue_reference.h
index 157c8f94d476..b1ee6ed73c8a 100644
--- a/libcxx/include/__type_traits/add_lvalue_reference.h
+++ b/libcxx/include/__type_traits/add_lvalue_reference.h
@@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if __has_builtin(__add_lvalue_reference)
 
 template <class _Tp>
-using __add_lvalue_reference_t = __add_lvalue_reference(_Tp);
+using __add_lvalue_reference_t _LIBCPP_NODEBUG = __add_lvalue_reference(_Tp);
 
 #else
 
diff --git a/libcxx/include/__type_traits/add_pointer.h b/libcxx/include/__type_traits/add_pointer.h
index f66e1f9e6c06..b53d8eae708e 100644
--- a/libcxx/include/__type_traits/add_pointer.h
+++ b/libcxx/include/__type_traits/add_pointer.h
@@ -23,7 +23,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if !defined(_LIBCPP_WORKAROUND_OBJCXX_COMPILER_INTRINSICS) && __has_builtin(__add_pointer)
 
 template <class _Tp>
-using __add_pointer_t = __add_pointer(_Tp);
+using __add_pointer_t _LIBCPP_NODEBUG = __add_pointer(_Tp);
 
 #else
 template <class _Tp, bool = __libcpp_is_referenceable<_Tp>::value || is_void<_Tp>::value>
diff --git a/libcxx/include/__type_traits/add_rvalue_reference.h b/libcxx/include/__type_traits/add_rvalue_reference.h
index 205058892c87..d844ccc1f539 100644
--- a/libcxx/include/__type_traits/add_rvalue_reference.h
+++ b/libcxx/include/__type_traits/add_rvalue_reference.h
@@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if __has_builtin(__add_rvalue_reference)
 
 template <class _Tp>
-using __add_rvalue_reference_t = __add_rvalue_reference(_Tp);
+using __add_rvalue_reference_t _LIBCPP_NODEBUG = __add_rvalue_reference(_Tp);
 
 #else
 
diff --git a/libcxx/include/__type_traits/aligned_storage.h b/libcxx/include/__type_traits/aligned_storage.h
index 5cd1f587b988..d98749980122 100644
--- a/libcxx/include/__type_traits/aligned_storage.h
+++ b/libcxx/include/__type_traits/aligned_storage.h
@@ -34,7 +34,7 @@ struct __struct_double4 {
   double __lx[4];
 };
 
-using __all_types =
+using __all_types _LIBCPP_NODEBUG =
     __type_list<__align_type<unsigned char>,
                 __align_type<unsigned short>,
                 __align_type<unsigned int>,
diff --git a/libcxx/include/__type_traits/common_reference.h b/libcxx/include/__type_traits/common_reference.h
index c802902eb19f..d436949e692f 100644
--- a/libcxx/include/__type_traits/common_reference.h
+++ b/libcxx/include/__type_traits/common_reference.h
@@ -30,7 +30,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if _LIBCPP_STD_VER >= 20
 // Let COND_RES(X, Y) be:
 template <class _Xp, class _Yp>
-using __cond_res = decltype(false ? std::declval<_Xp (&)()>()() : std::declval<_Yp (&)()>()());
+using __cond_res _LIBCPP_NODEBUG = decltype(false ? std::declval<_Xp (&)()>()() : std::declval<_Yp (&)()>()());
 
 // Let `XREF(A)` denote a unary alias template `T` such that `T<U>` denotes the same type as `U`
 // with the addition of `A`'s cv and reference qualifiers, for a non-reference cv-unqualified type
@@ -39,7 +39,7 @@ using __cond_res = decltype(false ? std::declval<_Xp (&)()>()() : std::declval<_
 template <class _Tp>
 struct __xref {
   template <class _Up>
-  using __apply = __copy_cvref_t<_Tp, _Up>;
+  using __apply _LIBCPP_NODEBUG = __copy_cvref_t<_Tp, _Up>;
 };
 
 // Given types A and B, let X be remove_reference_t<A>, let Y be remove_reference_t<B>,
@@ -48,10 +48,10 @@ template <class _Ap, class _Bp, class _Xp = remove_reference_t<_Ap>, class _Yp =
 struct __common_ref;
 
 template <class _Xp, class _Yp>
-using __common_ref_t = typename __common_ref<_Xp, _Yp>::__type;
+using __common_ref_t _LIBCPP_NODEBUG = typename __common_ref<_Xp, _Yp>::__type;
 
 template <class _Xp, class _Yp>
-using __cv_cond_res = __cond_res<__copy_cv_t<_Xp, _Yp>&, __copy_cv_t<_Yp, _Xp>&>;
+using __cv_cond_res _LIBCPP_NODEBUG = __cond_res<__copy_cv_t<_Xp, _Yp>&, __copy_cv_t<_Yp, _Xp>&>;
 
 //    If A and B are both lvalue reference types, COMMON-REF(A, B) is
 //    COND-RES(COPYCV(X, Y)&, COPYCV(Y, X)&) if that type exists and is a reference type.
@@ -61,13 +61,13 @@ template <class _Ap, class _Bp, class _Xp, class _Yp>
     requires { typename __cv_cond_res<_Xp, _Yp>; } &&
     is_reference_v<__cv_cond_res<_Xp, _Yp>>
 struct __common_ref<_Ap&, _Bp&, _Xp, _Yp> {
-  using __type = __cv_cond_res<_Xp, _Yp>;
+  using __type _LIBCPP_NODEBUG = __cv_cond_res<_Xp, _Yp>;
 };
 // clang-format on
 
 //    Otherwise, let C be remove_reference_t<COMMON-REF(X&, Y&)>&&. ...
 template <class _Xp, class _Yp>
-using __common_ref_C = remove_reference_t<__common_ref_t<_Xp&, _Yp&>>&&;
+using __common_ref_C _LIBCPP_NODEBUG = remove_reference_t<__common_ref_t<_Xp&, _Yp&>>&&;
 
 //    .... If A and B are both rvalue reference types, C is well-formed, and
 //    is_convertible_v<A, C> && is_convertible_v<B, C> is true, then COMMON-REF(A, B) is C.
@@ -78,13 +78,13 @@ template <class _Ap, class _Bp, class _Xp, class _Yp>
     is_convertible_v<_Ap&&, __common_ref_C<_Xp, _Yp>> &&
     is_convertible_v<_Bp&&, __common_ref_C<_Xp, _Yp>>
 struct __common_ref<_Ap&&, _Bp&&, _Xp, _Yp> {
-  using __type = __common_ref_C<_Xp, _Yp>;
+  using __type _LIBCPP_NODEBUG = __common_ref_C<_Xp, _Yp>;
 };
 // clang-format on
 
 //    Otherwise, let D be COMMON-REF(const X&, Y&). ...
 template <class _Tp, class _Up>
-using __common_ref_D = __common_ref_t<const _Tp&, _Up&>;
+using __common_ref_D _LIBCPP_NODEBUG = __common_ref_t<const _Tp&, _Up&>;
 
 //    ... If A is an rvalue reference and B is an lvalue reference and D is well-formed and
 //    is_convertible_v<A, D> is true, then COMMON-REF(A, B) is D.
@@ -94,7 +94,7 @@ template <class _Ap, class _Bp, class _Xp, class _Yp>
     requires { typename __common_ref_D<_Xp, _Yp>; } &&
     is_convertible_v<_Ap&&, __common_ref_D<_Xp, _Yp>>
 struct __common_ref<_Ap&&, _Bp&, _Xp, _Yp> {
-  using __type = __common_ref_D<_Xp, _Yp>;
+  using __type _LIBCPP_NODEBUG = __common_ref_D<_Xp, _Yp>;
 };
 // clang-format on
 
@@ -150,7 +150,7 @@ template <class, class, template <class> class, template <class> class>
 struct basic_common_reference {};
 
 template <class _Tp, class _Up>
-using __basic_common_reference_t =
+using __basic_common_reference_t _LIBCPP_NODEBUG =
     typename basic_common_reference<remove_cvref_t<_Tp>,
                                     remove_cvref_t<_Up>,
                                     __xref<_Tp>::template __apply,
diff --git a/libcxx/include/__type_traits/common_type.h b/libcxx/include/__type_traits/common_type.h
index ee5596bd1249..e4c6b495c3bd 100644
--- a/libcxx/include/__type_traits/common_type.h
+++ b/libcxx/include/__type_traits/common_type.h
@@ -31,7 +31,7 @@ template <class... _Args>
 struct common_type;
 
 template <class... _Args>
-using __common_type_t = typename common_type<_Args...>::type;
+using __common_type_t _LIBCPP_NODEBUG = typename common_type<_Args...>::type;
 
 template <class... _Args>
 struct common_type : __builtin_common_type<__common_type_t, __type_identity, __empty, _Args...> {};
@@ -40,7 +40,7 @@ struct common_type : __builtin_common_type<__common_type_t, __type_identity, __e
 #  if _LIBCPP_STD_VER >= 20
 // Let COND_RES(X, Y) be:
 template <class _Tp, class _Up>
-using __cond_type = decltype(false ? std::declval<_Tp>() : std::declval<_Up>());
+using __cond_type _LIBCPP_NODEBUG = decltype(false ? std::declval<_Tp>() : std::declval<_Up>());
 
 template <class _Tp, class _Up, class = void>
 struct __common_type3 {};
diff --git a/libcxx/include/__type_traits/conjunction.h b/libcxx/include/__type_traits/conjunction.h
index c2995591bbc2..4001d6c12d5d 100644
--- a/libcxx/include/__type_traits/conjunction.h
+++ b/libcxx/include/__type_traits/conjunction.h
@@ -22,7 +22,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class...>
-using __expand_to_true = true_type;
+using __expand_to_true _LIBCPP_NODEBUG = true_type;
 
 template <class... _Pred>
 __expand_to_true<__enable_if_t<_Pred::value>...> __and_helper(int);
diff --git a/libcxx/include/__type_traits/copy_cv.h b/libcxx/include/__type_traits/copy_cv.h
index d482cb42bffe..8378fbd50ef5 100644
--- a/libcxx/include/__type_traits/copy_cv.h
+++ b/libcxx/include/__type_traits/copy_cv.h
@@ -22,29 +22,29 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _From>
 struct __copy_cv {
   template <class _To>
-  using __apply = _To;
+  using __apply _LIBCPP_NODEBUG = _To;
 };
 
 template <class _From>
 struct __copy_cv<const _From> {
   template <class _To>
-  using __apply = const _To;
+  using __apply _LIBCPP_NODEBUG = const _To;
 };
 
 template <class _From>
 struct __copy_cv<volatile _From> {
   template <class _To>
-  using __apply = volatile _To;
+  using __apply _LIBCPP_NODEBUG = volatile _To;
 };
 
 template <class _From>
 struct __copy_cv<const volatile _From> {
   template <class _To>
-  using __apply = const volatile _To;
+  using __apply _LIBCPP_NODEBUG = const volatile _To;
 };
 
 template <class _From, class _To>
-using __copy_cv_t = typename __copy_cv<_From>::template __apply<_To>;
+using __copy_cv_t _LIBCPP_NODEBUG = typename __copy_cv<_From>::template __apply<_To>;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/copy_cvref.h b/libcxx/include/__type_traits/copy_cvref.h
index 8bbf8efdf44d..511d4e0776d6 100644
--- a/libcxx/include/__type_traits/copy_cvref.h
+++ b/libcxx/include/__type_traits/copy_cvref.h
@@ -36,7 +36,7 @@ struct __copy_cvref<_From&&, _To> {
 };
 
 template <class _From, class _To>
-using __copy_cvref_t = typename __copy_cvref<_From, _To>::type;
+using __copy_cvref_t _LIBCPP_NODEBUG = typename __copy_cvref<_From, _To>::type;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/disjunction.h b/libcxx/include/__type_traits/disjunction.h
index 2c89528d9f2f..d579de9b9843 100644
--- a/libcxx/include/__type_traits/disjunction.h
+++ b/libcxx/include/__type_traits/disjunction.h
@@ -31,7 +31,7 @@ struct _OrImpl<true> {
 template <>
 struct _OrImpl<false> {
   template <class _Res, class...>
-  using _Result = _Res;
+  using _Result _LIBCPP_NODEBUG = _Res;
 };
 
 // _Or always performs lazy evaluation of its arguments.
diff --git a/libcxx/include/__type_traits/invoke.h b/libcxx/include/__type_traits/invoke.h
index 71db32ae6a3c..6f641b9a81b8 100644
--- a/libcxx/include/__type_traits/invoke.h
+++ b/libcxx/include/__type_traits/invoke.h
@@ -44,12 +44,12 @@ template <class _Fp,
           class _DecayFp = __decay_t<_Fp>,
           class _DecayA0 = __decay_t<_A0>,
           class _ClassT  = typename __member_pointer_class_type<_DecayFp>::type>
-using __enable_if_bullet1 =
+using __enable_if_bullet1 _LIBCPP_NODEBUG =
     __enable_if_t<is_member_function_pointer<_DecayFp>::value &&
                   (is_same<_ClassT, _DecayA0>::value || is_base_of<_ClassT, _DecayA0>::value)>;
 
 template <class _Fp, class _A0, class _DecayFp = __decay_t<_Fp>, class _DecayA0 = __decay_t<_A0> >
-using __enable_if_bullet2 =
+using __enable_if_bullet2 _LIBCPP_NODEBUG =
     __enable_if_t<is_member_function_pointer<_DecayFp>::value && __is_reference_wrapper<_DecayA0>::value>;
 
 template <class _Fp,
@@ -57,7 +57,7 @@ template <class _Fp,
           class _DecayFp = __decay_t<_Fp>,
           class _DecayA0 = __decay_t<_A0>,
           class _ClassT  = typename __member_pointer_class_type<_DecayFp>::type>
-using __enable_if_bullet3 =
+using __enable_if_bullet3 _LIBCPP_NODEBUG =
     __enable_if_t<is_member_function_pointer<_DecayFp>::value &&
                   !(is_same<_ClassT, _DecayA0>::value || is_base_of<_ClassT, _DecayA0>::value) &&
                   !__is_reference_wrapper<_DecayA0>::value>;
@@ -67,12 +67,12 @@ template <class _Fp,
           class _DecayFp = __decay_t<_Fp>,
           class _DecayA0 = __decay_t<_A0>,
           class _ClassT  = typename __member_pointer_class_type<_DecayFp>::type>
-using __enable_if_bullet4 =
+using __enable_if_bullet4 _LIBCPP_NODEBUG =
     __enable_if_t<is_member_object_pointer<_DecayFp>::value &&
                   (is_same<_ClassT, _DecayA0>::value || is_base_of<_ClassT, _DecayA0>::value)>;
 
 template <class _Fp, class _A0, class _DecayFp = __decay_t<_Fp>, class _DecayA0 = __decay_t<_A0> >
-using __enable_if_bullet5 =
+using __enable_if_bullet5 _LIBCPP_NODEBUG =
     __enable_if_t<is_member_object_pointer<_DecayFp>::value && __is_reference_wrapper<_DecayA0>::value>;
 
 template <class _Fp,
@@ -80,7 +80,7 @@ template <class _Fp,
           class _DecayFp = __decay_t<_Fp>,
           class _DecayA0 = __decay_t<_A0>,
           class _ClassT  = typename __member_pointer_class_type<_DecayFp>::type>
-using __enable_if_bullet6 =
+using __enable_if_bullet6 _LIBCPP_NODEBUG =
     __enable_if_t<is_member_object_pointer<_DecayFp>::value &&
                   !(is_same<_ClassT, _DecayA0>::value || is_base_of<_ClassT, _DecayA0>::value) &&
                   !__is_reference_wrapper<_DecayA0>::value>;
@@ -159,7 +159,7 @@ struct __invokable_r {
 
   // FIXME: Check that _Ret, _Fp, and _Args... are all complete types, cv void,
   // or incomplete array types as required by the standard.
-  using _Result = decltype(__try_call<_Fp, _Args...>(0));
+  using _Result _LIBCPP_NODEBUG = decltype(__try_call<_Fp, _Args...>(0));
 
   using type              = __conditional_t<_IsNotSame<_Result, __nat>::value,
                                             __conditional_t<is_void<_Ret>::value, true_type, __is_core_convertible<_Result, _Ret> >,
@@ -167,7 +167,7 @@ struct __invokable_r {
   static const bool value = type::value;
 };
 template <class _Fp, class... _Args>
-using __invokable = __invokable_r<void, _Fp, _Args...>;
+using __invokable _LIBCPP_NODEBUG = __invokable_r<void, _Fp, _Args...>;
 
 template <bool _IsInvokable, bool _IsCVVoid, class _Ret, class _Fp, class... _Args>
 struct __nothrow_invokable_r_imp {
@@ -199,11 +199,12 @@ struct __nothrow_invokable_r_imp<true, true, _Ret, _Fp, _Args...> {
 };
 
 template <class _Ret, class _Fp, class... _Args>
-using __nothrow_invokable_r =
+using __nothrow_invokable_r _LIBCPP_NODEBUG =
     __nothrow_invokable_r_imp<__invokable_r<_Ret, _Fp, _Args...>::value, is_void<_Ret>::value, _Ret, _Fp, _Args...>;
 
 template <class _Fp, class... _Args>
-using __nothrow_invokable = __nothrow_invokable_r_imp<__invokable<_Fp, _Args...>::value, true, void, _Fp, _Args...>;
+using __nothrow_invokable _LIBCPP_NODEBUG =
+    __nothrow_invokable_r_imp<__invokable<_Fp, _Args...>::value, true, void, _Fp, _Args...>;
 
 template <class _Fp, class... _Args>
 struct __invoke_of
diff --git a/libcxx/include/__type_traits/is_always_bitcastable.h b/libcxx/include/__type_traits/is_always_bitcastable.h
index 5bc650b41358..4c6c43c6571f 100644
--- a/libcxx/include/__type_traits/is_always_bitcastable.h
+++ b/libcxx/include/__type_traits/is_always_bitcastable.h
@@ -31,8 +31,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // considered bit-castable.
 template <class _From, class _To>
 struct __is_always_bitcastable {
-  using _UnqualFrom = __remove_cv_t<_From>;
-  using _UnqualTo   = __remove_cv_t<_To>;
+  using _UnqualFrom _LIBCPP_NODEBUG = __remove_cv_t<_From>;
+  using _UnqualTo _LIBCPP_NODEBUG   = __remove_cv_t<_To>;
 
   // clang-format off
   static const bool value =
diff --git a/libcxx/include/__type_traits/is_char_like_type.h b/libcxx/include/__type_traits/is_char_like_type.h
index 26205843047c..913c0821c8c6 100644
--- a/libcxx/include/__type_traits/is_char_like_type.h
+++ b/libcxx/include/__type_traits/is_char_like_type.h
@@ -21,7 +21,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _CharT>
-using _IsCharLikeType = _And<is_standard_layout<_CharT>, is_trivial<_CharT> >;
+using _IsCharLikeType _LIBCPP_NODEBUG = _And<is_standard_layout<_CharT>, is_trivial<_CharT> >;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/is_equality_comparable.h b/libcxx/include/__type_traits/is_equality_comparable.h
index 4397f743e5ee..3ee1839996be 100644
--- a/libcxx/include/__type_traits/is_equality_comparable.h
+++ b/libcxx/include/__type_traits/is_equality_comparable.h
@@ -80,7 +80,7 @@ struct __libcpp_is_trivially_equality_comparable_impl<_Tp*, _Up*>
 };
 
 template <class _Tp, class _Up>
-using __libcpp_is_trivially_equality_comparable =
+using __libcpp_is_trivially_equality_comparable _LIBCPP_NODEBUG =
     __libcpp_is_trivially_equality_comparable_impl<__remove_cv_t<_Tp>, __remove_cv_t<_Up> >;
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__type_traits/is_execution_policy.h b/libcxx/include/__type_traits/is_execution_policy.h
index 6884f17ba16c..a2d876db0309 100644
--- a/libcxx/include/__type_traits/is_execution_policy.h
+++ b/libcxx/include/__type_traits/is_execution_policy.h
@@ -50,7 +50,7 @@ __remove_parallel_policy(const _ExecutionPolicy& = _ExecutionPolicy{execution::_
 // Removes the "parallel" part of an execution policy.
 // For example, turns par_unseq into unseq, and par into seq.
 template <class _ExecutionPolicy>
-using __remove_parallel_policy_t = decltype(std::__remove_parallel_policy<_ExecutionPolicy>());
+using __remove_parallel_policy_t _LIBCPP_NODEBUG = decltype(std::__remove_parallel_policy<_ExecutionPolicy>());
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/is_primary_template.h b/libcxx/include/__type_traits/is_primary_template.h
index f308dfadc8ec..5fe6820bc7f7 100644
--- a/libcxx/include/__type_traits/is_primary_template.h
+++ b/libcxx/include/__type_traits/is_primary_template.h
@@ -21,10 +21,11 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Tp>
-using __test_for_primary_template = __enable_if_t<_IsSame<_Tp, typename _Tp::__primary_template>::value>;
+using __test_for_primary_template _LIBCPP_NODEBUG =
+    __enable_if_t<_IsSame<_Tp, typename _Tp::__primary_template>::value>;
 
 template <class _Tp>
-using __is_primary_template = _IsValidExpansion<__test_for_primary_template, _Tp>;
+using __is_primary_template _LIBCPP_NODEBUG = _IsValidExpansion<__test_for_primary_template, _Tp>;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/is_same.h b/libcxx/include/__type_traits/is_same.h
index 9561b7b5d6da..400f870904d2 100644
--- a/libcxx/include/__type_traits/is_same.h
+++ b/libcxx/include/__type_traits/is_same.h
@@ -34,10 +34,10 @@ inline constexpr bool is_same_v = __is_same(_Tp, _Up);
 // (such as in a dependent return type).
 
 template <class _Tp, class _Up>
-using _IsSame = _BoolConstant<__is_same(_Tp, _Up)>;
+using _IsSame _LIBCPP_NODEBUG = _BoolConstant<__is_same(_Tp, _Up)>;
 
 template <class _Tp, class _Up>
-using _IsNotSame = _BoolConstant<!__is_same(_Tp, _Up)>;
+using _IsNotSame _LIBCPP_NODEBUG = _BoolConstant<!__is_same(_Tp, _Up)>;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/is_swappable.h b/libcxx/include/__type_traits/is_swappable.h
index 221f017700a2..aa5eecd9abe0 100644
--- a/libcxx/include/__type_traits/is_swappable.h
+++ b/libcxx/include/__type_traits/is_swappable.h
@@ -41,10 +41,11 @@ inline const bool __is_nothrow_swappable_v = __is_nothrow_swappable_with_v<_Tp&,
 
 #ifndef _LIBCPP_CXX03_LANG
 template <class _Tp>
-using __swap_result_t = __enable_if_t<is_move_constructible<_Tp>::value && is_move_assignable<_Tp>::value>;
+using __swap_result_t _LIBCPP_NODEBUG =
+    __enable_if_t<is_move_constructible<_Tp>::value && is_move_assignable<_Tp>::value>;
 #else
 template <class>
-using __swap_result_t = void;
+using __swap_result_t _LIBCPP_NODEBUG = void;
 #endif
 
 template <class _Tp>
diff --git a/libcxx/include/__type_traits/make_32_64_or_128_bit.h b/libcxx/include/__type_traits/make_32_64_or_128_bit.h
index 70f84fcd1868..7016209ec9c0 100644
--- a/libcxx/include/__type_traits/make_32_64_or_128_bit.h
+++ b/libcxx/include/__type_traits/make_32_64_or_128_bit.h
@@ -31,7 +31,7 @@ template <class _Tp>
   requires(is_signed_v<_Tp> || is_unsigned_v<_Tp> || is_same_v<_Tp, char>)
 #endif
 // clang-format off
-using __make_32_64_or_128_bit_t =
+using __make_32_64_or_128_bit_t _LIBCPP_NODEBUG =
     __copy_unsigned_t<_Tp,
         __conditional_t<sizeof(_Tp) <= sizeof(int32_t),    int32_t,
         __conditional_t<sizeof(_Tp) <= sizeof(int64_t),    int64_t,
diff --git a/libcxx/include/__type_traits/make_const_lvalue_ref.h b/libcxx/include/__type_traits/make_const_lvalue_ref.h
index 469d4cb31ef7..f9955334de30 100644
--- a/libcxx/include/__type_traits/make_const_lvalue_ref.h
+++ b/libcxx/include/__type_traits/make_const_lvalue_ref.h
@@ -19,7 +19,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Tp>
-using __make_const_lvalue_ref = const __libcpp_remove_reference_t<_Tp>&;
+using __make_const_lvalue_ref _LIBCPP_NODEBUG = const __libcpp_remove_reference_t<_Tp>&;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/make_signed.h b/libcxx/include/__type_traits/make_signed.h
index 5c2739e67435..88513fea3006 100644
--- a/libcxx/include/__type_traits/make_signed.h
+++ b/libcxx/include/__type_traits/make_signed.h
@@ -26,7 +26,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if __has_builtin(__make_signed)
 
 template <class _Tp>
-using __make_signed_t = __make_signed(_Tp);
+using __make_signed_t _LIBCPP_NODEBUG = __make_signed(_Tp);
 
 #else
 using __signed_types =
diff --git a/libcxx/include/__type_traits/make_unsigned.h b/libcxx/include/__type_traits/make_unsigned.h
index 6c238685c233..83ff8b7bb801 100644
--- a/libcxx/include/__type_traits/make_unsigned.h
+++ b/libcxx/include/__type_traits/make_unsigned.h
@@ -28,7 +28,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if __has_builtin(__make_unsigned)
 
 template <class _Tp>
-using __make_unsigned_t = __make_unsigned(_Tp);
+using __make_unsigned_t _LIBCPP_NODEBUG = __make_unsigned(_Tp);
 
 #else
 using __unsigned_types =
@@ -88,7 +88,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __make_unsigned_t<_Tp> __to_unsigned_lik
 }
 
 template <class _Tp, class _Up>
-using __copy_unsigned_t = __conditional_t<is_unsigned<_Tp>::value, __make_unsigned_t<_Up>, _Up>;
+using __copy_unsigned_t _LIBCPP_NODEBUG = __conditional_t<is_unsigned<_Tp>::value, __make_unsigned_t<_Up>, _Up>;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/maybe_const.h b/libcxx/include/__type_traits/maybe_const.h
index 25fba58fb773..7ef742a123d0 100644
--- a/libcxx/include/__type_traits/maybe_const.h
+++ b/libcxx/include/__type_traits/maybe_const.h
@@ -19,7 +19,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <bool _Const, class _Tp>
-using __maybe_const = __conditional_t<_Const, const _Tp, _Tp>;
+using __maybe_const _LIBCPP_NODEBUG = __conditional_t<_Const, const _Tp, _Tp>;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/remove_all_extents.h b/libcxx/include/__type_traits/remove_all_extents.h
index db7dab4a6c13..d46a3228b4ab 100644
--- a/libcxx/include/__type_traits/remove_all_extents.h
+++ b/libcxx/include/__type_traits/remove_all_extents.h
@@ -25,7 +25,7 @@ struct remove_all_extents {
 };
 
 template <class _Tp>
-using __remove_all_extents_t = __remove_all_extents(_Tp);
+using __remove_all_extents_t _LIBCPP_NODEBUG = __remove_all_extents(_Tp);
 #else
 template <class _Tp>
 struct _LIBCPP_TEMPLATE_VIS remove_all_extents {
diff --git a/libcxx/include/__type_traits/remove_const.h b/libcxx/include/__type_traits/remove_const.h
index a3f0648c4785..6250d9f53117 100644
--- a/libcxx/include/__type_traits/remove_const.h
+++ b/libcxx/include/__type_traits/remove_const.h
@@ -24,7 +24,7 @@ struct remove_const {
 };
 
 template <class _Tp>
-using __remove_const_t = __remove_const(_Tp);
+using __remove_const_t _LIBCPP_NODEBUG = __remove_const(_Tp);
 #else
 template <class _Tp>
 struct _LIBCPP_TEMPLATE_VIS remove_const {
diff --git a/libcxx/include/__type_traits/remove_const_ref.h b/libcxx/include/__type_traits/remove_const_ref.h
index d3b334935a5b..e6583b396e6c 100644
--- a/libcxx/include/__type_traits/remove_const_ref.h
+++ b/libcxx/include/__type_traits/remove_const_ref.h
@@ -20,7 +20,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Tp>
-using __remove_const_ref_t = __remove_const_t<__libcpp_remove_reference_t<_Tp> >;
+using __remove_const_ref_t _LIBCPP_NODEBUG = __remove_const_t<__libcpp_remove_reference_t<_Tp> >;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/remove_cv.h b/libcxx/include/__type_traits/remove_cv.h
index 50e9f3e8aa78..16848e6d7112 100644
--- a/libcxx/include/__type_traits/remove_cv.h
+++ b/libcxx/include/__type_traits/remove_cv.h
@@ -24,10 +24,10 @@ struct remove_cv {
 
 #if defined(_LIBCPP_COMPILER_GCC)
 template <class _Tp>
-using __remove_cv_t = typename remove_cv<_Tp>::type;
+using __remove_cv_t _LIBCPP_NODEBUG = typename remove_cv<_Tp>::type;
 #else
 template <class _Tp>
-using __remove_cv_t = __remove_cv(_Tp);
+using __remove_cv_t _LIBCPP_NODEBUG = __remove_cv(_Tp);
 #endif
 
 #if _LIBCPP_STD_VER >= 14
diff --git a/libcxx/include/__type_traits/remove_cvref.h b/libcxx/include/__type_traits/remove_cvref.h
index 55f894dbd1d8..e3c65944e33c 100644
--- a/libcxx/include/__type_traits/remove_cvref.h
+++ b/libcxx/include/__type_traits/remove_cvref.h
@@ -34,7 +34,7 @@ using __remove_cvref_t _LIBCPP_NODEBUG = __remove_cvref(_Tp);
 #endif // __has_builtin(__remove_cvref)
 
 template <class _Tp, class _Up>
-using __is_same_uncvref = _IsSame<__remove_cvref_t<_Tp>, __remove_cvref_t<_Up> >;
+using __is_same_uncvref _LIBCPP_NODEBUG = _IsSame<__remove_cvref_t<_Tp>, __remove_cvref_t<_Up> >;
 
 #if _LIBCPP_STD_VER >= 20
 template <class _Tp>
diff --git a/libcxx/include/__type_traits/remove_extent.h b/libcxx/include/__type_traits/remove_extent.h
index aceeb4706966..95a7971d7a9c 100644
--- a/libcxx/include/__type_traits/remove_extent.h
+++ b/libcxx/include/__type_traits/remove_extent.h
@@ -25,7 +25,7 @@ struct remove_extent {
 };
 
 template <class _Tp>
-using __remove_extent_t = __remove_extent(_Tp);
+using __remove_extent_t _LIBCPP_NODEBUG = __remove_extent(_Tp);
 #else
 template <class _Tp>
 struct _LIBCPP_TEMPLATE_VIS remove_extent {
diff --git a/libcxx/include/__type_traits/remove_pointer.h b/libcxx/include/__type_traits/remove_pointer.h
index 6f98ed1a5d64..47cd1cd1d80f 100644
--- a/libcxx/include/__type_traits/remove_pointer.h
+++ b/libcxx/include/__type_traits/remove_pointer.h
@@ -25,10 +25,10 @@ struct remove_pointer {
 
 #  ifdef _LIBCPP_COMPILER_GCC
 template <class _Tp>
-using __remove_pointer_t = typename remove_pointer<_Tp>::type;
+using __remove_pointer_t _LIBCPP_NODEBUG = typename remove_pointer<_Tp>::type;
 #  else
 template <class _Tp>
-using __remove_pointer_t = __remove_pointer(_Tp);
+using __remove_pointer_t _LIBCPP_NODEBUG = __remove_pointer(_Tp);
 #  endif
 #else
 // clang-format off
diff --git a/libcxx/include/__type_traits/remove_reference.h b/libcxx/include/__type_traits/remove_reference.h
index ba67891758ad..f68815691ac0 100644
--- a/libcxx/include/__type_traits/remove_reference.h
+++ b/libcxx/include/__type_traits/remove_reference.h
@@ -24,7 +24,7 @@ struct remove_reference {
 };
 
 template <class _Tp>
-using __libcpp_remove_reference_t = __remove_reference_t(_Tp);
+using __libcpp_remove_reference_t _LIBCPP_NODEBUG = __remove_reference_t(_Tp);
 #elif __has_builtin(__remove_reference)
 template <class _Tp>
 struct remove_reference {
diff --git a/libcxx/include/__type_traits/remove_volatile.h b/libcxx/include/__type_traits/remove_volatile.h
index 7600ae0ec516..099945df0124 100644
--- a/libcxx/include/__type_traits/remove_volatile.h
+++ b/libcxx/include/__type_traits/remove_volatile.h
@@ -24,7 +24,7 @@ struct remove_volatile {
 };
 
 template <class _Tp>
-using __remove_volatile_t = __remove_volatile(_Tp);
+using __remove_volatile_t _LIBCPP_NODEBUG = __remove_volatile(_Tp);
 #else
 template <class _Tp>
 struct _LIBCPP_TEMPLATE_VIS remove_volatile {
diff --git a/libcxx/include/__type_traits/unwrap_ref.h b/libcxx/include/__type_traits/unwrap_ref.h
index 5ac037333d08..11a069d66302 100644
--- a/libcxx/include/__type_traits/unwrap_ref.h
+++ b/libcxx/include/__type_traits/unwrap_ref.h
@@ -30,7 +30,7 @@ struct __unwrap_reference<reference_wrapper<_Tp> > {
 };
 
 template <class _Tp>
-using __unwrap_ref_decay_t = typename __unwrap_reference<__decay_t<_Tp> >::type;
+using __unwrap_ref_decay_t _LIBCPP_NODEBUG = typename __unwrap_reference<__decay_t<_Tp> >::type;
 
 #if _LIBCPP_STD_VER >= 20
 template <class _Tp>
diff --git a/libcxx/include/__type_traits/void_t.h b/libcxx/include/__type_traits/void_t.h
index 985bba02e72f..8adadfa69637 100644
--- a/libcxx/include/__type_traits/void_t.h
+++ b/libcxx/include/__type_traits/void_t.h
@@ -23,7 +23,7 @@ using void_t = void;
 #endif
 
 template <class...>
-using __void_t = void;
+using __void_t _LIBCPP_NODEBUG = void;
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__utility/exception_guard.h b/libcxx/include/__utility/exception_guard.h
index 71e52fdb4b2a..a6b4ec521107 100644
--- a/libcxx/include/__utility/exception_guard.h
+++ b/libcxx/include/__utility/exception_guard.h
@@ -126,10 +126,10 @@ _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(__exception_guard_noexceptions);
 
 #if !_LIBCPP_HAS_EXCEPTIONS
 template <class _Rollback>
-using __exception_guard = __exception_guard_noexceptions<_Rollback>;
+using __exception_guard _LIBCPP_NODEBUG = __exception_guard_noexceptions<_Rollback>;
 #else
 template <class _Rollback>
-using __exception_guard = __exception_guard_exceptions<_Rollback>;
+using __exception_guard _LIBCPP_NODEBUG = __exception_guard_exceptions<_Rollback>;
 #endif
 
 template <class _Rollback>
diff --git a/libcxx/include/__utility/forward_like.h b/libcxx/include/__utility/forward_like.h
index 67bdf6d054ea..409f716cfbce 100644
--- a/libcxx/include/__utility/forward_like.h
+++ b/libcxx/include/__utility/forward_like.h
@@ -26,13 +26,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 #if _LIBCPP_STD_VER >= 23
 
 template <class _Ap, class _Bp>
-using _CopyConst = _If<is_const_v<_Ap>, const _Bp, _Bp>;
+using _CopyConst _LIBCPP_NODEBUG = _If<is_const_v<_Ap>, const _Bp, _Bp>;
 
 template <class _Ap, class _Bp>
-using _OverrideRef = _If<is_rvalue_reference_v<_Ap>, remove_reference_t<_Bp>&&, _Bp&>;
+using _OverrideRef _LIBCPP_NODEBUG = _If<is_rvalue_reference_v<_Ap>, remove_reference_t<_Bp>&&, _Bp&>;
 
 template <class _Ap, class _Bp>
-using _ForwardLike = _OverrideRef<_Ap&&, _CopyConst<remove_reference_t<_Ap>, remove_reference_t<_Bp>>>;
+using _ForwardLike _LIBCPP_NODEBUG = _OverrideRef<_Ap&&, _CopyConst<remove_reference_t<_Ap>, remove_reference_t<_Bp>>>;
 
 template <class _Tp, class _Up>
 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr auto
diff --git a/libcxx/include/__utility/in_place.h b/libcxx/include/__utility/in_place.h
index edaa4e02c55f..9b48446d8370 100644
--- a/libcxx/include/__utility/in_place.h
+++ b/libcxx/include/__utility/in_place.h
@@ -47,7 +47,7 @@ template <class _Tp>
 struct __is_inplace_type_imp<in_place_type_t<_Tp>> : true_type {};
 
 template <class _Tp>
-using __is_inplace_type = __is_inplace_type_imp<__remove_cvref_t<_Tp>>;
+using __is_inplace_type _LIBCPP_NODEBUG = __is_inplace_type_imp<__remove_cvref_t<_Tp>>;
 
 template <class _Tp>
 struct __is_inplace_index_imp : false_type {};
@@ -55,7 +55,7 @@ template <size_t _Idx>
 struct __is_inplace_index_imp<in_place_index_t<_Idx>> : true_type {};
 
 template <class _Tp>
-using __is_inplace_index = __is_inplace_index_imp<__remove_cvref_t<_Tp>>;
+using __is_inplace_index _LIBCPP_NODEBUG = __is_inplace_index_imp<__remove_cvref_t<_Tp>>;
 
 #endif // _LIBCPP_STD_VER >= 17
 
diff --git a/libcxx/include/__utility/integer_sequence.h b/libcxx/include/__utility/integer_sequence.h
index 35eb606ee37f..2c1ff3c543e8 100644
--- a/libcxx/include/__utility/integer_sequence.h
+++ b/libcxx/include/__utility/integer_sequence.h
@@ -25,19 +25,19 @@ struct __tuple_indices;
 template <class _IdxType, _IdxType... _Values>
 struct __integer_sequence {
   template <template <class _OIdxType, _OIdxType...> class _ToIndexSeq, class _ToIndexType>
-  using __convert = _ToIndexSeq<_ToIndexType, _Values...>;
+  using __convert _LIBCPP_NODEBUG = _ToIndexSeq<_ToIndexType, _Values...>;
 
   template <size_t _Sp>
-  using __to_tuple_indices = __tuple_indices<(_Values + _Sp)...>;
+  using __to_tuple_indices _LIBCPP_NODEBUG = __tuple_indices<(_Values + _Sp)...>;
 };
 
 #if __has_builtin(__make_integer_seq)
 template <size_t _Ep, size_t _Sp>
-using __make_indices_imp =
+using __make_indices_imp _LIBCPP_NODEBUG =
     typename __make_integer_seq<__integer_sequence, size_t, _Ep - _Sp>::template __to_tuple_indices<_Sp>;
 #elif __has_builtin(__integer_pack)
 template <size_t _Ep, size_t _Sp>
-using __make_indices_imp =
+using __make_indices_imp _LIBCPP_NODEBUG =
     typename __integer_sequence<size_t, __integer_pack(_Ep - _Sp)...>::template __to_tuple_indices<_Sp>;
 #else
 #  error "No known way to get an integer pack from the compiler"
diff --git a/libcxx/include/__utility/move.h b/libcxx/include/__utility/move.h
index 015986f610bd..bc16697b5c89 100644
--- a/libcxx/include/__utility/move.h
+++ b/libcxx/include/__utility/move.h
@@ -33,7 +33,7 @@ move(_LIBCPP_LIFETIMEBOUND _Tp&& __t) _NOEXCEPT {
 }
 
 template <class _Tp>
-using __move_if_noexcept_result_t =
+using __move_if_noexcept_result_t _LIBCPP_NODEBUG =
     __conditional_t<!is_nothrow_move_constructible<_Tp>::value && is_copy_constructible<_Tp>::value, const _Tp&, _Tp&&>;
 
 template <class _Tp>
diff --git a/libcxx/include/__utility/pair.h b/libcxx/include/__utility/pair.h
index bb81e30926d7..7689ab2a48c6 100644
--- a/libcxx/include/__utility/pair.h
+++ b/libcxx/include/__utility/pair.h
@@ -71,7 +71,7 @@ struct _LIBCPP_TEMPLATE_VIS pair
   _T1 first;
   _T2 second;
 
-  using __trivially_relocatable =
+  using __trivially_relocatable _LIBCPP_NODEBUG =
       __conditional_t<__libcpp_is_trivially_relocatable<_T1>::value && __libcpp_is_trivially_relocatable<_T2>::value,
                       pair,
                       void>;
diff --git a/libcxx/include/__utility/swap.h b/libcxx/include/__utility/swap.h
index 666d6d50f0d9..b4311540d36e 100644
--- a/libcxx/include/__utility/swap.h
+++ b/libcxx/include/__utility/swap.h
@@ -31,10 +31,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 #ifndef _LIBCPP_CXX03_LANG
 template <class _Tp>
-using __swap_result_t = __enable_if_t<is_move_constructible<_Tp>::value && is_move_assignable<_Tp>::value>;
+using __swap_result_t _LIBCPP_NODEBUG =
+    __enable_if_t<is_move_constructible<_Tp>::value && is_move_assignable<_Tp>::value>;
 #else
 template <class>
-using __swap_result_t = void;
+using __swap_result_t _LIBCPP_NODEBUG = void;
 #endif
 
 template <class _Tp>
diff --git a/libcxx/include/__vector/vector.h b/libcxx/include/__vector/vector.h
index 6ba7ba7bcf72..ddbf1235b906 100644
--- a/libcxx/include/__vector/vector.h
+++ b/libcxx/include/__vector/vector.h
@@ -114,7 +114,7 @@ public:
   // - pointer: may be trivially relocatable, so it's checked
   // - allocator_type: may be trivially relocatable, so it's checked
   // vector doesn't contain any self-references, so it's trivially relocatable if its members are.
-  using __trivially_relocatable = __conditional_t<
+  using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t<
       __libcpp_is_trivially_relocatable<pointer>::value && __libcpp_is_trivially_relocatable<allocator_type>::value,
       vector,
       void>;
diff --git a/libcxx/include/__vector/vector_bool.h b/libcxx/include/__vector/vector_bool.h
index 8658745b8a8f..6c6605fb3bd0 100644
--- a/libcxx/include/__vector/vector_bool.h
+++ b/libcxx/include/__vector/vector_bool.h
@@ -275,17 +275,33 @@ public:
   }
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reverse_iterator crend() const _NOEXCEPT { return rend(); }
 
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference operator[](size_type __n) { return __make_ref(__n); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference operator[](size_type __n) {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__n < size(), "vector<bool>::operator[] index out of bounds");
+    return __make_ref(__n);
+  }
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference operator[](size_type __n) const {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__n < size(), "vector<bool>::operator[] index out of bounds");
     return __make_ref(__n);
   }
-  _LIBCPP_HIDE_FROM_ABI reference at(size_type __n);
-  _LIBCPP_HIDE_FROM_ABI const_reference at(size_type __n) const;
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference at(size_type __n);
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference at(size_type __n) const;
 
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference front() { return __make_ref(0); }
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference front() const { return __make_ref(0); }
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference back() { return __make_ref(__size_ - 1); }
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference back() const { return __make_ref(__size_ - 1); }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference front() {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "vector<bool>::front() called on an empty vector");
+    return __make_ref(0);
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference front() const {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "vector<bool>::front() called on an empty vector");
+    return __make_ref(0);
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference back() {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "vector<bool>::back() called on an empty vector");
+    return __make_ref(__size_ - 1);
+  }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference back() const {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "vector<bool>::back() called on an empty vector");
+    return __make_ref(__size_ - 1);
+  }
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void push_back(const value_type& __x);
 #if _LIBCPP_STD_VER >= 14
@@ -310,7 +326,10 @@ public:
   }
 #endif
 
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void pop_back() { --__size_; }
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void pop_back() {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "vector<bool>::pop_back called on an empty vector");
+    --__size_;
+  }
 
 #if _LIBCPP_STD_VER >= 14
   template <class... _Args>
@@ -853,14 +872,15 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<bool, _Allocator>::shrink_to_fit() _NO
 }
 
 template <class _Allocator>
-typename vector<bool, _Allocator>::reference vector<bool, _Allocator>::at(size_type __n) {
+_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<bool, _Allocator>::reference vector<bool, _Allocator>::at(size_type __n) {
   if (__n >= size())
     this->__throw_out_of_range();
   return (*this)[__n];
 }
 
 template <class _Allocator>
-typename vector<bool, _Allocator>::const_reference vector<bool, _Allocator>::at(size_type __n) const {
+_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<bool, _Allocator>::const_reference
+vector<bool, _Allocator>::at(size_type __n) const {
   if (__n >= size())
     this->__throw_out_of_range();
   return (*this)[__n];
@@ -994,6 +1014,8 @@ vector<bool, _Allocator>::__insert_with_size(
 template <class _Allocator>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<bool, _Allocator>::iterator
 vector<bool, _Allocator>::erase(const_iterator __position) {
+  _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+      __position != end(), "vector<bool>::erase(iterator) called with a non-dereferenceable iterator");
   iterator __r = __const_iterator_cast(__position);
   std::copy(__position + 1, this->cend(), __r);
   --__size_;
@@ -1003,6 +1025,8 @@ vector<bool, _Allocator>::erase(const_iterator __position) {
 template <class _Allocator>
 _LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<bool, _Allocator>::iterator
 vector<bool, _Allocator>::erase(const_iterator __first, const_iterator __last) {
+  _LIBCPP_ASSERT_VALID_INPUT_RANGE(
+      __first <= __last, "vector<bool>::erase(iterator, iterator) called with an invalid range");
   iterator __r        = __const_iterator_cast(__first);
   difference_type __d = __last - __first;
   std::copy(__last, this->cend(), __r);
diff --git a/libcxx/include/any b/libcxx/include/any
index 934c4dbd45a6..786e86b5ccd8 100644
--- a/libcxx/include/any
+++ b/libcxx/include/any
@@ -149,11 +149,11 @@ _LIBCPP_HIDE_FROM_ABI add_pointer_t<_ValueType> any_cast(any*) _NOEXCEPT;
 
 namespace __any_imp {
 _LIBCPP_SUPPRESS_DEPRECATED_PUSH
-using _Buffer = aligned_storage_t<3 * sizeof(void*), alignof(void*)>;
+using _Buffer _LIBCPP_NODEBUG = aligned_storage_t<3 * sizeof(void*), alignof(void*)>;
 _LIBCPP_SUPPRESS_DEPRECATED_POP
 
 template <class _Tp>
-using _IsSmallObject =
+using _IsSmallObject _LIBCPP_NODEBUG =
     integral_constant<bool,
                       sizeof(_Tp) <= sizeof(_Buffer) && alignof(_Buffer) % alignof(_Tp) == 0 &&
                           is_nothrow_move_constructible<_Tp>::value >;
@@ -185,7 +185,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool __compare_typeid(type_info const* __id, const
 }
 
 template <class _Tp>
-using _Handler = conditional_t< _IsSmallObject<_Tp>::value, _SmallHandler<_Tp>, _LargeHandler<_Tp>>;
+using _Handler _LIBCPP_NODEBUG = conditional_t< _IsSmallObject<_Tp>::value, _SmallHandler<_Tp>, _LargeHandler<_Tp>>;
 
 } // namespace __any_imp
 
@@ -278,8 +278,9 @@ public:
 #    endif
 
 private:
-  typedef __any_imp::_Action _Action;
-  using _HandleFuncPtr = void* (*)(_Action, any const*, any*, const type_info*, const void* __fallback_info);
+  using _Action _LIBCPP_NODEBUG = __any_imp::_Action;
+  using _HandleFuncPtr
+      _LIBCPP_NODEBUG = void* (*)(_Action, any const*, any*, const type_info*, const void* __fallback_info);
 
   union _Storage {
     _LIBCPP_HIDE_FROM_ABI constexpr _Storage() : __ptr(nullptr) {}
diff --git a/libcxx/include/array b/libcxx/include/array
index 516d96538f5a..1b9bcd6891d9 100644
--- a/libcxx/include/array
+++ b/libcxx/include/array
@@ -173,15 +173,16 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Tp, size_t _Size>
 struct _LIBCPP_TEMPLATE_VIS array {
-  using __trivially_relocatable = __conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value, array, void>;
+  using __trivially_relocatable _LIBCPP_NODEBUG =
+      __conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value, array, void>;
 
   // types:
-  using __self          = array;
-  using value_type      = _Tp;
-  using reference       = value_type&;
-  using const_reference = const value_type&;
-  using pointer         = value_type*;
-  using const_pointer   = const value_type*;
+  using __self _LIBCPP_NODEBUG = array;
+  using value_type             = _Tp;
+  using reference              = value_type&;
+  using const_reference        = const value_type&;
+  using pointer                = value_type*;
+  using const_pointer          = const value_type*;
 #  if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY)
   using iterator       = __static_bounded_iter<pointer, _Size>;
   using const_iterator = __static_bounded_iter<const_pointer, _Size>;
@@ -299,12 +300,12 @@ struct _LIBCPP_TEMPLATE_VIS array {
 template <class _Tp>
 struct _LIBCPP_TEMPLATE_VIS array<_Tp, 0> {
   // types:
-  using __self          = array;
-  using value_type      = _Tp;
-  using reference       = value_type&;
-  using const_reference = const value_type&;
-  using pointer         = value_type*;
-  using const_pointer   = const value_type*;
+  using __self _LIBCPP_NODEBUG = array;
+  using value_type             = _Tp;
+  using reference              = value_type&;
+  using const_reference        = const value_type&;
+  using pointer                = value_type*;
+  using const_pointer          = const value_type*;
 #  if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY)
   using iterator       = __static_bounded_iter<pointer, 0>;
   using const_iterator = __static_bounded_iter<const_pointer, 0>;
@@ -320,7 +321,7 @@ struct _LIBCPP_TEMPLATE_VIS array<_Tp, 0> {
   using reverse_iterator       = std::reverse_iterator<iterator>;
   using const_reverse_iterator = std::reverse_iterator<const_iterator>;
 
-  using _EmptyType = __conditional_t<is_const<_Tp>::value, const __empty, __empty>;
+  using _EmptyType _LIBCPP_NODEBUG = __conditional_t<is_const<_Tp>::value, const __empty, __empty>;
 
   struct _ArrayInStructT {
     _Tp __data_[1];
diff --git a/libcxx/include/barrier b/libcxx/include/barrier
index 6861532b02ba..91dfa9720a37 100644
--- a/libcxx/include/barrier
+++ b/libcxx/include/barrier
@@ -95,7 +95,7 @@ It looks different from literature pseudocode for two main reasons:
 
 */
 
-using __barrier_phase_t = uint8_t;
+using __barrier_phase_t _LIBCPP_NODEBUG = uint8_t;
 
 class __barrier_algorithm_base;
 
diff --git a/libcxx/include/ccomplex b/libcxx/include/ccomplex
index 10eb8a36e417..ee7e088aac54 100644
--- a/libcxx/include/ccomplex
+++ b/libcxx/include/ccomplex
@@ -28,13 +28,14 @@
 
 #  if _LIBCPP_STD_VER >= 20
 
-using __standard_header_ccomplex _LIBCPP_DEPRECATED_("removed in C++20. Include <complex> instead.") = void;
-using __use_standard_header_ccomplex = __standard_header_ccomplex;
+using __standard_header_ccomplex
+    _LIBCPP_DEPRECATED_("removed in C++20. Include <complex> instead.") _LIBCPP_NODEBUG = void;
+using __use_standard_header_ccomplex _LIBCPP_NODEBUG                                    = __standard_header_ccomplex;
 
 #  elif _LIBCPP_STD_VER >= 17
 
-using __standard_header_ccomplex _LIBCPP_DEPRECATED_("Include <complex> instead.") = void;
-using __use_standard_header_ccomplex                                               = __standard_header_ccomplex;
+using __standard_header_ccomplex _LIBCPP_DEPRECATED_("Include <complex> instead.") _LIBCPP_NODEBUG = void;
+using __use_standard_header_ccomplex _LIBCPP_NODEBUG = __standard_header_ccomplex;
 
 #  endif
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
diff --git a/libcxx/include/ciso646 b/libcxx/include/ciso646
index 5b956401430a..34164362dc10 100644
--- a/libcxx/include/ciso646
+++ b/libcxx/include/ciso646
@@ -26,8 +26,9 @@
 
 #  if _LIBCPP_STD_VER >= 20
 
-using __standard_header_ciso646 _LIBCPP_DEPRECATED_("removed in C++20. Include <version> instead.") = void;
-using __use_standard_header_ciso646 = __standard_header_ciso646;
+using __standard_header_ciso646
+    _LIBCPP_DEPRECATED_("removed in C++20. Include <version> instead.") _LIBCPP_NODEBUG = void;
+using __use_standard_header_ciso646 _LIBCPP_NODEBUG                                     = __standard_header_ciso646;
 
 #  endif
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
diff --git a/libcxx/include/complex b/libcxx/include/complex
index bc73f313c2ce..df18159595b3 100644
--- a/libcxx/include/complex
+++ b/libcxx/include/complex
@@ -400,7 +400,7 @@ class _LIBCPP_TEMPLATE_VIS complex<long double>;
 struct __from_builtin_tag {};
 
 template <class _Tp>
-using __complex_t =
+using __complex_t _LIBCPP_NODEBUG =
     __conditional_t<is_same<_Tp, float>::value,
                     _Complex float,
                     __conditional_t<is_same<_Tp, double>::value, _Complex double, _Complex long double> >;
diff --git a/libcxx/include/cstdalign b/libcxx/include/cstdalign
index 6a277e467300..7f8dd1e1fbaf 100644
--- a/libcxx/include/cstdalign
+++ b/libcxx/include/cstdalign
@@ -45,13 +45,13 @@ Macros:
 
 #  if _LIBCPP_STD_VER >= 20
 
-using __standard_header_cstdalign _LIBCPP_DEPRECATED_("removed in C++20.") = void;
-using __use_standard_header_cstdalign                                      = __standard_header_cstdalign;
+using __standard_header_cstdalign _LIBCPP_DEPRECATED_("removed in C++20.") _LIBCPP_NODEBUG = void;
+using __use_standard_header_cstdalign _LIBCPP_NODEBUG = __standard_header_cstdalign;
 
 #  elif _LIBCPP_STD_VER >= 17
 
-using __standard_header_cstdalign _LIBCPP_DEPRECATED = void;
-using __use_standard_header_cstdalign                = __standard_header_cstdalign;
+using __standard_header_cstdalign _LIBCPP_DEPRECATED _LIBCPP_NODEBUG = void;
+using __use_standard_header_cstdalign _LIBCPP_NODEBUG                = __standard_header_cstdalign;
 
 #  endif
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
diff --git a/libcxx/include/cstdbool b/libcxx/include/cstdbool
index a12954f07398..a432d5f08b9a 100644
--- a/libcxx/include/cstdbool
+++ b/libcxx/include/cstdbool
@@ -33,13 +33,13 @@ Macros:
 
 #  if _LIBCPP_STD_VER >= 20
 
-using __standard_header_cstdbool _LIBCPP_DEPRECATED_("removed in C++20.") = void;
-using __use_standard_header_cstdbool                                      = __standard_header_cstdbool;
+using __standard_header_cstdbool _LIBCPP_DEPRECATED_("removed in C++20.") _LIBCPP_NODEBUG = void;
+using __use_standard_header_cstdbool _LIBCPP_NODEBUG                                      = __standard_header_cstdbool;
 
 #  elif _LIBCPP_STD_VER >= 17
 
-using __standard_header_cstdbool _LIBCPP_DEPRECATED = void;
-using __use_standard_header_cstdbool                = __standard_header_cstdbool;
+using __standard_header_cstdbool _LIBCPP_DEPRECATED _LIBCPP_NODEBUG = void;
+using __use_standard_header_cstdbool _LIBCPP_NODEBUG                = __standard_header_cstdbool;
 
 #  endif
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
diff --git a/libcxx/include/ctgmath b/libcxx/include/ctgmath
index 237f474f01c2..db0786f1e2c4 100644
--- a/libcxx/include/ctgmath
+++ b/libcxx/include/ctgmath
@@ -30,13 +30,14 @@
 
 #  if _LIBCPP_STD_VER >= 20
 
-using __standard_header_ctgmath _LIBCPP_DEPRECATED_("removed in C++20. Include <cmath> and <complex> instead.") = void;
-using __use_standard_header_ctgmath = __standard_header_ctgmath;
+using __standard_header_ctgmath
+    _LIBCPP_DEPRECATED_("removed in C++20. Include <cmath> and <complex> instead.") _LIBCPP_NODEBUG = void;
+using __use_standard_header_ctgmath _LIBCPP_NODEBUG = __standard_header_ctgmath;
 
 #  elif _LIBCPP_STD_VER >= 17
 
-using __standard_header_ctgmath _LIBCPP_DEPRECATED_("Include <cmath> and <complex> instead.") = void;
-using __use_standard_header_ctgmath = __standard_header_ctgmath;
+using __standard_header_ctgmath _LIBCPP_DEPRECATED_("Include <cmath> and <complex> instead.") _LIBCPP_NODEBUG = void;
+using __use_standard_header_ctgmath _LIBCPP_NODEBUG = __standard_header_ctgmath;
 
 #  endif
 
diff --git a/libcxx/include/deque b/libcxx/include/deque
index 883332f1ebdc..df3094cff7f8 100644
--- a/libcxx/include/deque
+++ b/libcxx/include/deque
@@ -456,12 +456,13 @@ template <class _ValueType, class _Pointer, class _Reference, class _MapPointer,
 struct __segmented_iterator_traits<
     __deque_iterator<_ValueType, _Pointer, _Reference, _MapPointer, _DiffType, _BlockSize> > {
 private:
-  using _Iterator = __deque_iterator<_ValueType, _Pointer, _Reference, _MapPointer, _DiffType, _BlockSize>;
+  using _Iterator _LIBCPP_NODEBUG =
+      __deque_iterator<_ValueType, _Pointer, _Reference, _MapPointer, _DiffType, _BlockSize>;
 
 public:
-  using __is_segmented_iterator = true_type;
-  using __segment_iterator      = _MapPointer;
-  using __local_iterator        = _Pointer;
+  using __is_segmented_iterator _LIBCPP_NODEBUG = true_type;
+  using __segment_iterator _LIBCPP_NODEBUG      = _MapPointer;
+  using __local_iterator _LIBCPP_NODEBUG        = _Pointer;
 
   static _LIBCPP_HIDE_FROM_ABI __segment_iterator __segment(_Iterator __iter) { return __iter.__m_iter_; }
   static _LIBCPP_HIDE_FROM_ABI __local_iterator __local(_Iterator __iter) { return __iter.__ptr_; }
@@ -491,8 +492,8 @@ public:
 
   using value_type = _Tp;
 
-  using allocator_type = _Allocator;
-  using __alloc_traits = allocator_traits<allocator_type>;
+  using allocator_type                 = _Allocator;
+  using __alloc_traits _LIBCPP_NODEBUG = allocator_traits<allocator_type>;
   static_assert(__check_valid_allocator<allocator_type>::value, "");
   static_assert(is_same<typename allocator_type::value_type, value_type>::value,
                 "Allocator::value_type must be same type as value_type");
@@ -503,13 +504,13 @@ public:
   using pointer       = typename __alloc_traits::pointer;
   using const_pointer = typename __alloc_traits::const_pointer;
 
-  using __pointer_allocator       = __rebind_alloc<__alloc_traits, pointer>;
-  using __const_pointer_allocator = __rebind_alloc<__alloc_traits, const_pointer>;
-  using __map                     = __split_buffer<pointer, __pointer_allocator>;
-  using __map_alloc_traits        = allocator_traits<__pointer_allocator>;
-  using __map_pointer             = typename __map_alloc_traits::pointer;
-  using __map_const_pointer       = typename allocator_traits<__const_pointer_allocator>::const_pointer;
-  using __map_const_iterator      = typename __map::const_iterator;
+  using __pointer_allocator _LIBCPP_NODEBUG       = __rebind_alloc<__alloc_traits, pointer>;
+  using __const_pointer_allocator _LIBCPP_NODEBUG = __rebind_alloc<__alloc_traits, const_pointer>;
+  using __map _LIBCPP_NODEBUG                     = __split_buffer<pointer, __pointer_allocator>;
+  using __map_alloc_traits _LIBCPP_NODEBUG        = allocator_traits<__pointer_allocator>;
+  using __map_pointer _LIBCPP_NODEBUG             = typename __map_alloc_traits::pointer;
+  using __map_const_pointer _LIBCPP_NODEBUG       = typename allocator_traits<__const_pointer_allocator>::const_pointer;
+  using __map_const_iterator _LIBCPP_NODEBUG      = typename __map::const_iterator;
 
   using reference       = value_type&;
   using const_reference = const value_type&;
@@ -525,7 +526,7 @@ public:
   // - size_type: is always trivially relocatable, since it is required to be an integral type
   // - allocator_type: may not be trivially relocatable, so it's checked
   // None of these are referencing the `deque` itself, so if all of them are trivially relocatable, `deque` is too.
-  using __trivially_relocatable = __conditional_t<
+  using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t<
       __libcpp_is_trivially_relocatable<__map>::value && __libcpp_is_trivially_relocatable<allocator_type>::value,
       deque,
       void>;
diff --git a/libcxx/include/experimental/__simd/scalar.h b/libcxx/include/experimental/__simd/scalar.h
index da318d2f4650..20c8b02c6589 100644
--- a/libcxx/include/experimental/__simd/scalar.h
+++ b/libcxx/include/experimental/__simd/scalar.h
@@ -49,8 +49,8 @@ struct __mask_storage<_Tp, simd_abi::__scalar> : __simd_storage<bool, simd_abi::
 
 template <class _Tp>
 struct __simd_operations<_Tp, simd_abi::__scalar> {
-  using _SimdStorage = __simd_storage<_Tp, simd_abi::__scalar>;
-  using _MaskStorage = __mask_storage<_Tp, simd_abi::__scalar>;
+  using _SimdStorage _LIBCPP_NODEBUG = __simd_storage<_Tp, simd_abi::__scalar>;
+  using _MaskStorage _LIBCPP_NODEBUG = __mask_storage<_Tp, simd_abi::__scalar>;
 
   static _LIBCPP_HIDE_FROM_ABI _SimdStorage __broadcast(_Tp __v) noexcept { return {__v}; }
 
@@ -86,7 +86,7 @@ struct __simd_operations<_Tp, simd_abi::__scalar> {
 
 template <class _Tp>
 struct __mask_operations<_Tp, simd_abi::__scalar> {
-  using _MaskStorage = __mask_storage<_Tp, simd_abi::__scalar>;
+  using _MaskStorage _LIBCPP_NODEBUG = __mask_storage<_Tp, simd_abi::__scalar>;
 
   static _LIBCPP_HIDE_FROM_ABI _MaskStorage __broadcast(bool __v) noexcept { return {__v}; }
 
diff --git a/libcxx/include/experimental/__simd/simd.h b/libcxx/include/experimental/__simd/simd.h
index fd919e75e32f..2fd2b2644d0b 100644
--- a/libcxx/include/experimental/__simd/simd.h
+++ b/libcxx/include/experimental/__simd/simd.h
@@ -43,8 +43,8 @@ public:
 // TODO: implement simd class
 template <class _Tp, class _Abi>
 class simd : public __simd_int_operators<simd<_Tp, _Abi>, __simd_operations<_Tp, _Abi>, is_integral_v<_Tp>> {
-  using _Impl    = __simd_operations<_Tp, _Abi>;
-  using _Storage = typename _Impl::_SimdStorage;
+  using _Impl _LIBCPP_NODEBUG    = __simd_operations<_Tp, _Abi>;
+  using _Storage _LIBCPP_NODEBUG = typename _Impl::_SimdStorage;
 
   _Storage __s_;
 
diff --git a/libcxx/include/experimental/__simd/simd_mask.h b/libcxx/include/experimental/__simd/simd_mask.h
index 6b6f671bf3e6..a11766545b43 100644
--- a/libcxx/include/experimental/__simd/simd_mask.h
+++ b/libcxx/include/experimental/__simd/simd_mask.h
@@ -27,8 +27,8 @@ inline namespace parallelism_v2 {
 // TODO: implement simd_mask class
 template <class _Tp, class _Abi>
 class simd_mask {
-  using _Impl    = __mask_operations<_Tp, _Abi>;
-  using _Storage = typename _Impl::_MaskStorage;
+  using _Impl _LIBCPP_NODEBUG    = __mask_operations<_Tp, _Abi>;
+  using _Storage _LIBCPP_NODEBUG = typename _Impl::_MaskStorage;
 
   _Storage __s_;
 
diff --git a/libcxx/include/experimental/__simd/vec_ext.h b/libcxx/include/experimental/__simd/vec_ext.h
index abc7e9595be9..2a4b8c748f86 100644
--- a/libcxx/include/experimental/__simd/vec_ext.h
+++ b/libcxx/include/experimental/__simd/vec_ext.h
@@ -55,8 +55,8 @@ struct __mask_storage<_Tp, simd_abi::__vec_ext<_Np>>
 
 template <class _Tp, int _Np>
 struct __simd_operations<_Tp, simd_abi::__vec_ext<_Np>> {
-  using _SimdStorage = __simd_storage<_Tp, simd_abi::__vec_ext<_Np>>;
-  using _MaskStorage = __mask_storage<_Tp, simd_abi::__vec_ext<_Np>>;
+  using _SimdStorage _LIBCPP_NODEBUG = __simd_storage<_Tp, simd_abi::__vec_ext<_Np>>;
+  using _MaskStorage _LIBCPP_NODEBUG = __mask_storage<_Tp, simd_abi::__vec_ext<_Np>>;
 
   static _LIBCPP_HIDE_FROM_ABI _SimdStorage __broadcast(_Tp __v) noexcept {
     _SimdStorage __result;
@@ -101,7 +101,7 @@ struct __simd_operations<_Tp, simd_abi::__vec_ext<_Np>> {
 
 template <class _Tp, int _Np>
 struct __mask_operations<_Tp, simd_abi::__vec_ext<_Np>> {
-  using _MaskStorage = __mask_storage<_Tp, simd_abi::__vec_ext<_Np>>;
+  using _MaskStorage _LIBCPP_NODEBUG = __mask_storage<_Tp, simd_abi::__vec_ext<_Np>>;
 
   static _LIBCPP_HIDE_FROM_ABI _MaskStorage __broadcast(bool __v) noexcept {
     _MaskStorage __result;
diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list
index ea854ea828b3..f3b9617ab2e0 100644
--- a/libcxx/include/forward_list
+++ b/libcxx/include/forward_list
@@ -317,7 +317,8 @@ struct __forward_begin_node {
 };
 
 template <class _Tp, class _VoidPtr>
-using __begin_node_of = __forward_begin_node<__rebind_pointer_t<_VoidPtr, __forward_list_node<_Tp, _VoidPtr> > >;
+using __begin_node_of _LIBCPP_NODEBUG =
+    __forward_begin_node<__rebind_pointer_t<_VoidPtr, __forward_list_node<_Tp, _VoidPtr> > >;
 
 template <class _Tp, class _VoidPtr>
 struct __forward_list_node : public __begin_node_of<_Tp, _VoidPtr> {
diff --git a/libcxx/include/ios b/libcxx/include/ios
index 7c2ee83d4624..98a088266539 100644
--- a/libcxx/include/ios
+++ b/libcxx/include/ios
@@ -629,9 +629,9 @@ private:
   basic_ostream<char_type, traits_type>* __tie_;
 
 #    if defined(_LIBCPP_ABI_IOS_ALLOW_ARBITRARY_FILL_VALUE)
-  using _FillType = _FillHelper<traits_type>;
+  using _FillType _LIBCPP_NODEBUG = _FillHelper<traits_type>;
 #    else
-  using _FillType = _SentinelValueFill<traits_type>;
+  using _FillType _LIBCPP_NODEBUG = _SentinelValueFill<traits_type>;
 #    endif
   mutable _FillType __fill_;
 };
diff --git a/libcxx/include/optional b/libcxx/include/optional
index 165e0f16cab9..c325140ee66f 100644
--- a/libcxx/include/optional
+++ b/libcxx/include/optional
@@ -353,8 +353,8 @@ struct __optional_destruct_base<_Tp, true> {
 
 template <class _Tp, bool = is_reference<_Tp>::value>
 struct __optional_storage_base : __optional_destruct_base<_Tp> {
-  using __base     = __optional_destruct_base<_Tp>;
-  using value_type = _Tp;
+  using __base _LIBCPP_NODEBUG = __optional_destruct_base<_Tp>;
+  using value_type             = _Tp;
   using __base::__base;
 
   _LIBCPP_HIDE_FROM_ABI constexpr bool has_value() const noexcept { return this->__engaged_; }
@@ -396,8 +396,8 @@ struct __optional_storage_base : __optional_destruct_base<_Tp> {
 // to ensure we can make the change in an ABI-compatible manner.
 template <class _Tp>
 struct __optional_storage_base<_Tp, true> {
-  using value_type = _Tp;
-  using __raw_type = remove_reference_t<_Tp>;
+  using value_type                 = _Tp;
+  using __raw_type _LIBCPP_NODEBUG = remove_reference_t<_Tp>;
   __raw_type* __value_;
 
   template <class _Up>
@@ -555,11 +555,11 @@ struct __optional_move_assign_base<_Tp, false> : __optional_copy_assign_base<_Tp
 };
 
 template <class _Tp>
-using __optional_sfinae_ctor_base_t =
+using __optional_sfinae_ctor_base_t _LIBCPP_NODEBUG =
     __sfinae_ctor_base< is_copy_constructible<_Tp>::value, is_move_constructible<_Tp>::value >;
 
 template <class _Tp>
-using __optional_sfinae_assign_base_t =
+using __optional_sfinae_assign_base_t _LIBCPP_NODEBUG =
     __sfinae_assign_base< (is_copy_constructible<_Tp>::value && is_copy_assignable<_Tp>::value),
                           (is_move_constructible<_Tp>::value && is_move_assignable<_Tp>::value) >;
 
@@ -583,12 +583,13 @@ class _LIBCPP_DECLSPEC_EMPTY_BASES optional
     : private __optional_move_assign_base<_Tp>,
       private __optional_sfinae_ctor_base_t<_Tp>,
       private __optional_sfinae_assign_base_t<_Tp> {
-  using __base = __optional_move_assign_base<_Tp>;
+  using __base _LIBCPP_NODEBUG = __optional_move_assign_base<_Tp>;
 
 public:
   using value_type = _Tp;
 
-  using __trivially_relocatable = conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value, optional, void>;
+  using __trivially_relocatable _LIBCPP_NODEBUG =
+      conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value, optional, void>;
 
 private:
   // Disable the reference extension using this static assert.
@@ -613,7 +614,7 @@ private:
     }
   };
   template <class _Up>
-  using _CheckOptionalArgsCtor =
+  using _CheckOptionalArgsCtor _LIBCPP_NODEBUG =
       _If< _IsNotSame<__remove_cvref_t<_Up>, in_place_t>::value && _IsNotSame<__remove_cvref_t<_Up>, optional>::value &&
                (!is_same_v<remove_cv_t<_Tp>, bool> || !__is_std_optional<__remove_cvref_t<_Up>>::value),
            _CheckOptionalArgsConstructor,
@@ -621,7 +622,7 @@ private:
   template <class _QualUp>
   struct _CheckOptionalLikeConstructor {
     template <class _Up, class _Opt = optional<_Up>>
-    using __check_constructible_from_opt =
+    using __check_constructible_from_opt _LIBCPP_NODEBUG =
         _Or< is_constructible<_Tp, _Opt&>,
              is_constructible<_Tp, _Opt const&>,
              is_constructible<_Tp, _Opt&&>,
@@ -631,7 +632,7 @@ private:
              is_convertible<_Opt&&, _Tp>,
              is_convertible<_Opt const&&, _Tp> >;
     template <class _Up, class _Opt = optional<_Up>>
-    using __check_assignable_from_opt =
+    using __check_assignable_from_opt _LIBCPP_NODEBUG =
         _Or< is_assignable<_Tp&, _Opt&>,
              is_assignable<_Tp&, _Opt const&>,
              is_assignable<_Tp&, _Opt&&>,
@@ -655,12 +656,12 @@ private:
   };
 
   template <class _Up, class _QualUp>
-  using _CheckOptionalLikeCtor =
+  using _CheckOptionalLikeCtor _LIBCPP_NODEBUG =
       _If< _And< _IsNotSame<_Up, _Tp>, is_constructible<_Tp, _QualUp> >::value,
            _CheckOptionalLikeConstructor<_QualUp>,
            __check_tuple_constructor_fail >;
   template <class _Up, class _QualUp>
-  using _CheckOptionalLikeAssign =
+  using _CheckOptionalLikeAssign _LIBCPP_NODEBUG =
       _If< _And< _IsNotSame<_Up, _Tp>, is_constructible<_Tp, _QualUp>, is_assignable<_Tp&, _QualUp> >::value,
            _CheckOptionalLikeConstructor<_QualUp>,
            __check_tuple_constructor_fail >;
diff --git a/libcxx/include/ratio b/libcxx/include/ratio
index b35e2bd9dad6..2b5e34cbcd18 100644
--- a/libcxx/include/ratio
+++ b/libcxx/include/ratio
@@ -465,7 +465,7 @@ struct _LIBCPP_TEMPLATE_VIS ratio_greater_equal : _BoolConstant<!ratio_less<_R1,
 };
 
 template <class _R1, class _R2>
-using __ratio_gcd = ratio<__static_gcd<_R1::num, _R2::num>, __static_lcm<_R1::den, _R2::den> >;
+using __ratio_gcd _LIBCPP_NODEBUG = ratio<__static_gcd<_R1::num, _R2::num>, __static_lcm<_R1::den, _R2::den> >;
 
 #  if _LIBCPP_STD_VER >= 17
 template <class _R1, class _R2>
diff --git a/libcxx/include/regex b/libcxx/include/regex
index 15ec15a6985e..5cad0bc4b812 100644
--- a/libcxx/include/regex
+++ b/libcxx/include/regex
@@ -4229,7 +4229,8 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator==(const sub_match<_BiIter>& __x, cons
 
 #  if _LIBCPP_STD_VER >= 20
 template <class _BiIter>
-using __sub_match_cat = compare_three_way_result_t<basic_string<typename iterator_traits<_BiIter>::value_type>>;
+using __sub_match_cat _LIBCPP_NODEBUG =
+    compare_three_way_result_t<basic_string<typename iterator_traits<_BiIter>::value_type>>;
 
 template <class _BiIter>
 _LIBCPP_HIDE_FROM_ABI auto operator<=>(const sub_match<_BiIter>& __x, const sub_match<_BiIter>& __y) {
diff --git a/libcxx/include/source_location b/libcxx/include/source_location
index bbbb86bc68c8..b4777ce5a100 100644
--- a/libcxx/include/source_location
+++ b/libcxx/include/source_location
@@ -55,7 +55,7 @@ class source_location {
   // in constant evaluation, so we don't want to use `void*` as the argument
   // type unless the builtin returned that, anyhow, and the invalid cast is
   // unavoidable.
-  using __bsl_ty = decltype(__builtin_source_location());
+  using __bsl_ty _LIBCPP_NODEBUG = decltype(__builtin_source_location());
 
 public:
   // The defaulted __ptr argument is necessary so that the builtin is evaluated
diff --git a/libcxx/include/string b/libcxx/include/string
index 7808f56f6001..39982d5670bd 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -764,7 +764,7 @@ struct __padding<0> {};
 template <class _CharT, class _Traits, class _Allocator>
 class basic_string {
 private:
-  using __default_allocator_type = allocator<_CharT>;
+  using __default_allocator_type _LIBCPP_NODEBUG = allocator<_CharT>;
 
 public:
   typedef basic_string __self;
@@ -798,7 +798,7 @@ public:
   // Therefore it's crucial to ensure the destructor is called.
   using __trivially_relocatable = void;
 #  else
-  using __trivially_relocatable = __conditional_t<
+  using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t<
       __libcpp_is_trivially_relocatable<allocator_type>::value && __libcpp_is_trivially_relocatable<pointer>::value,
       basic_string,
       void>;
diff --git a/libcxx/include/tuple b/libcxx/include/tuple
index e4f1fc209b73..aca14ba408d3 100644
--- a/libcxx/include/tuple
+++ b/libcxx/include/tuple
@@ -552,7 +552,8 @@ class _LIBCPP_TEMPLATE_VIS tuple {
   get(const tuple<_Up...>&&) _NOEXCEPT;
 
 public:
-  using __trivially_relocatable = __conditional_t<_And<__libcpp_is_trivially_relocatable<_Tp>...>::value, tuple, void>;
+  using __trivially_relocatable _LIBCPP_NODEBUG =
+      __conditional_t<_And<__libcpp_is_trivially_relocatable<_Tp>...>::value, tuple, void>;
 
   // [tuple.cnstr]
 
diff --git a/libcxx/include/variant b/libcxx/include/variant
index 3fa1b4b30f05..6c7be7f8f1eb 100644
--- a/libcxx/include/variant
+++ b/libcxx/include/variant
@@ -357,7 +357,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr auto __choose_index_type() {
 }
 
 template <size_t _NumAlts>
-using __variant_index_t = decltype(std::__choose_index_type<_NumAlts>());
+using __variant_index_t _LIBCPP_NODEBUG = decltype(std::__choose_index_type<_NumAlts>());
 
 template <class _IndexType>
 constexpr _IndexType __variant_npos = static_cast<_IndexType>(-1);
@@ -658,8 +658,8 @@ private:
 
 template <size_t _Index, class _Tp>
 struct _LIBCPP_TEMPLATE_VIS __alt {
-  using __value_type              = _Tp;
-  static constexpr size_t __index = _Index;
+  using __value_type _LIBCPP_NODEBUG = _Tp;
+  static constexpr size_t __index    = _Index;
 
   template <class... _Args>
   _LIBCPP_HIDE_FROM_ABI explicit constexpr __alt(in_place_t, _Args&&... __args)
@@ -713,7 +713,7 @@ _LIBCPP_VARIANT_UNION(_Trait::_Unavailable, _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTE
 template <_Trait _DestructibleTrait, class... _Types>
 class _LIBCPP_TEMPLATE_VIS __base {
 public:
-  using __index_t = __variant_index_t<sizeof...(_Types)>;
+  using __index_t _LIBCPP_NODEBUG = __variant_index_t<sizeof...(_Types)>;
 
   _LIBCPP_HIDE_FROM_ABI explicit constexpr __base(__valueless_t __tag) noexcept
       : __data(__tag), __index(__variant_npos<__index_t>) {}
@@ -753,8 +753,8 @@ class _LIBCPP_TEMPLATE_VIS __dtor;
       template <class... _Types>                                                                                       \
       class _LIBCPP_TEMPLATE_VIS __dtor<__traits<_Types...>, destructible_trait>                                       \
           : public __base<destructible_trait, _Types...> {                                                             \
-        using __base_type = __base<destructible_trait, _Types...>;                                                     \
-        using __index_t   = typename __base_type::__index_t;                                                           \
+        using __base_type _LIBCPP_NODEBUG = __base<destructible_trait, _Types...>;                                     \
+        using __index_t _LIBCPP_NODEBUG   = typename __base_type::__index_t;                                           \
                                                                                                                        \
       public:                                                                                                          \
         using __base_type::__base_type;                                                                                \
@@ -799,7 +799,7 @@ _LIBCPP_VARIANT_DESTRUCTOR(_Trait::_Unavailable,
 
 template <class _Traits>
 class _LIBCPP_TEMPLATE_VIS __ctor : public __dtor<_Traits> {
-  using __base_type = __dtor<_Traits>;
+  using __base_type _LIBCPP_NODEBUG = __dtor<_Traits>;
 
 public:
   using __base_type::__base_type;
@@ -831,7 +831,7 @@ class _LIBCPP_TEMPLATE_VIS __move_constructor;
       template <class... _Types>                                                                                       \
       class _LIBCPP_TEMPLATE_VIS __move_constructor<__traits<_Types...>, move_constructible_trait>                     \
           : public __ctor<__traits<_Types...>> {                                                                       \
-        using __base_type = __ctor<__traits<_Types...>>;                                                               \
+        using __base_type _LIBCPP_NODEBUG = __ctor<__traits<_Types...>>;                                               \
                                                                                                                        \
       public:                                                                                                          \
         using __base_type::__base_type;                                                                                \
@@ -869,7 +869,7 @@ class _LIBCPP_TEMPLATE_VIS __copy_constructor;
       template <class... _Types>                                                                                       \
       class _LIBCPP_TEMPLATE_VIS __copy_constructor<__traits<_Types...>, copy_constructible_trait>                     \
           : public __move_constructor<__traits<_Types...>> {                                                           \
-        using __base_type = __move_constructor<__traits<_Types...>>;                                                   \
+        using __base_type _LIBCPP_NODEBUG = __move_constructor<__traits<_Types...>>;                                   \
                                                                                                                        \
       public:                                                                                                          \
         using __base_type::__base_type;                                                                                \
@@ -899,7 +899,7 @@ _LIBCPP_VARIANT_COPY_CONSTRUCTOR(
 
 template <class _Traits>
 class _LIBCPP_TEMPLATE_VIS __assignment : public __copy_constructor<_Traits> {
-  using __base_type = __copy_constructor<_Traits>;
+  using __base_type _LIBCPP_NODEBUG = __copy_constructor<_Traits>;
 
 public:
   using __base_type::__base_type;
@@ -958,7 +958,7 @@ class _LIBCPP_TEMPLATE_VIS __move_assignment;
       template <class... _Types>                                                                                       \
       class _LIBCPP_TEMPLATE_VIS __move_assignment<__traits<_Types...>, move_assignable_trait>                         \
           : public __assignment<__traits<_Types...>> {                                                                 \
-        using __base_type = __assignment<__traits<_Types...>>;                                                         \
+        using __base_type _LIBCPP_NODEBUG = __assignment<__traits<_Types...>>;                                         \
                                                                                                                        \
       public:                                                                                                          \
         using __base_type::__base_type;                                                                                \
@@ -997,7 +997,7 @@ class _LIBCPP_TEMPLATE_VIS __copy_assignment;
       template <class... _Types>                                                                                       \
       class _LIBCPP_TEMPLATE_VIS __copy_assignment<__traits<_Types...>, copy_assignable_trait>                         \
           : public __move_assignment<__traits<_Types...>> {                                                            \
-        using __base_type = __move_assignment<__traits<_Types...>>;                                                    \
+        using __base_type _LIBCPP_NODEBUG = __move_assignment<__traits<_Types...>>;                                    \
                                                                                                                        \
       public:                                                                                                          \
         using __base_type::__base_type;                                                                                \
@@ -1030,7 +1030,7 @@ _LIBCPP_VARIANT_COPY_ASSIGNMENT(_Trait::_Unavailable,
 
 template <class... _Types>
 class _LIBCPP_TEMPLATE_VIS __impl : public __copy_assignment<__traits<_Types...>> {
-  using __base_type = __copy_assignment<__traits<_Types...>>;
+  using __base_type _LIBCPP_NODEBUG = __copy_assignment<__traits<_Types...>>;
 
 public:
   using __base_type::__base_type; // get in_place_index_t constructor & friends
@@ -1097,7 +1097,7 @@ private:
 
 struct __no_narrowing_check {
   template <class _Dest, class _Source>
-  using _Apply = __type_identity<_Dest>;
+  using _Apply _LIBCPP_NODEBUG = __type_identity<_Dest>;
 };
 
 struct __narrowing_check {
@@ -1138,7 +1138,7 @@ using _MakeOverloads _LIBCPP_NODEBUG =
     typename __make_overloads_imp< __make_indices_imp<sizeof...(_Types), 0> >::template _Apply<_Types...>;
 
 template <class _Tp, class... _Types>
-using __best_match_t = typename invoke_result_t<_MakeOverloads<_Types...>, _Tp, _Tp>::type;
+using __best_match_t _LIBCPP_NODEBUG = typename invoke_result_t<_MakeOverloads<_Types...>, _Tp, _Tp>::type;
 
 } // namespace __variant_detail
 
@@ -1170,10 +1170,10 @@ class _LIBCPP_TEMPLATE_VIS _LIBCPP_DECLSPEC_EMPTY_BASES variant
 
   static_assert(__all<!is_void_v<_Types>...>::value, "variant can not have a void type as an alternative.");
 
-  using __first_type = variant_alternative_t<0, variant>;
+  using __first_type _LIBCPP_NODEBUG = variant_alternative_t<0, variant>;
 
 public:
-  using __trivially_relocatable =
+  using __trivially_relocatable _LIBCPP_NODEBUG =
       conditional_t<_And<__libcpp_is_trivially_relocatable<_Types>...>::value, variant, void>;
 
   template <bool _Dummy                                                                               = true,
diff --git a/libcxx/src/experimental/tzdb.cpp b/libcxx/src/experimental/tzdb.cpp
index d22de21c9981..638d45f69e03 100644
--- a/libcxx/src/experimental/tzdb.cpp
+++ b/libcxx/src/experimental/tzdb.cpp
@@ -9,11 +9,14 @@
 // For information see https://libcxx.llvm.org/DesignDocs/TimeZone.html
 
 #include <algorithm>
+#include <cctype>
 #include <chrono>
 #include <filesystem>
 #include <fstream>
 #include <stdexcept>
 #include <string>
+#include <string_view>
+#include <vector>
 
 #include "include/tzdb/time_zone_private.h"
 #include "include/tzdb/types_private.h"
diff --git a/libcxx/src/filesystem/directory_iterator.cpp b/libcxx/src/filesystem/directory_iterator.cpp
index d7ed9a358f55..7e8e40d17f7a 100644
--- a/libcxx/src/filesystem/directory_iterator.cpp
+++ b/libcxx/src/filesystem/directory_iterator.cpp
@@ -47,9 +47,9 @@ public:
     }
     __stream_ = ::FindFirstFileW((root / "*").c_str(), &__data_);
     if (__stream_ == INVALID_HANDLE_VALUE) {
-      ec                                  = detail::make_windows_error(GetLastError());
+      ec                                  = detail::get_last_error();
       const bool ignore_permission_denied = bool(opts & directory_options::skip_permission_denied);
-      if (ignore_permission_denied && ec.value() == static_cast<int>(errc::permission_denied))
+      if (ignore_permission_denied && ec == errc::permission_denied)
         ec.clear();
       return;
     }
@@ -91,7 +91,7 @@ private:
   error_code close() noexcept {
     error_code ec;
     if (!::FindClose(__stream_))
-      ec = detail::make_windows_error(GetLastError());
+      ec = detail::get_last_error();
     __stream_ = INVALID_HANDLE_VALUE;
     return ec;
   }
@@ -118,7 +118,7 @@ public:
     if ((__stream_ = ::opendir(root.c_str())) == nullptr) {
       ec                      = detail::capture_errno();
       const bool allow_eacces = bool(opts & directory_options::skip_permission_denied);
-      if (allow_eacces && ec.value() == EACCES)
+      if (allow_eacces && ec == errc::permission_denied)
         ec.clear();
       return;
     }
@@ -307,7 +307,7 @@ bool recursive_directory_iterator::__try_recursion(error_code* ec) {
   }
   if (m_ec) {
     const bool allow_eacess = bool(__imp_->__options_ & directory_options::skip_permission_denied);
-    if (m_ec.value() == EACCES && allow_eacess) {
+    if (m_ec == errc::permission_denied && allow_eacess) {
       if (ec)
         ec->clear();
     } else {
diff --git a/libcxx/src/filesystem/error.h b/libcxx/src/filesystem/error.h
index 07ba7fc3eef2..c0213910b378 100644
--- a/libcxx/src/filesystem/error.h
+++ b/libcxx/src/filesystem/error.h
@@ -32,80 +32,21 @@ _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM
 
 namespace detail {
 
-#if defined(_LIBCPP_WIN32API)
-
-inline errc __win_err_to_errc(int err) {
-  constexpr struct {
-    DWORD win;
-    errc errc;
-  } win_error_mapping[] = {
-      {ERROR_ACCESS_DENIED, errc::permission_denied},
-      {ERROR_ALREADY_EXISTS, errc::file_exists},
-      {ERROR_BAD_NETPATH, errc::no_such_file_or_directory},
-      {ERROR_BAD_PATHNAME, errc::no_such_file_or_directory},
-      {ERROR_BAD_UNIT, errc::no_such_device},
-      {ERROR_BROKEN_PIPE, errc::broken_pipe},
-      {ERROR_BUFFER_OVERFLOW, errc::filename_too_long},
-      {ERROR_BUSY, errc::device_or_resource_busy},
-      {ERROR_BUSY_DRIVE, errc::device_or_resource_busy},
-      {ERROR_CANNOT_MAKE, errc::permission_denied},
-      {ERROR_CANTOPEN, errc::io_error},
-      {ERROR_CANTREAD, errc::io_error},
-      {ERROR_CANTWRITE, errc::io_error},
-      {ERROR_CURRENT_DIRECTORY, errc::permission_denied},
-      {ERROR_DEV_NOT_EXIST, errc::no_such_device},
-      {ERROR_DEVICE_IN_USE, errc::device_or_resource_busy},
-      {ERROR_DIR_NOT_EMPTY, errc::directory_not_empty},
-      {ERROR_DIRECTORY, errc::invalid_argument},
-      {ERROR_DISK_FULL, errc::no_space_on_device},
-      {ERROR_FILE_EXISTS, errc::file_exists},
-      {ERROR_FILE_NOT_FOUND, errc::no_such_file_or_directory},
-      {ERROR_HANDLE_DISK_FULL, errc::no_space_on_device},
-      {ERROR_INVALID_ACCESS, errc::permission_denied},
-      {ERROR_INVALID_DRIVE, errc::no_such_device},
-      {ERROR_INVALID_FUNCTION, errc::function_not_supported},
-      {ERROR_INVALID_HANDLE, errc::invalid_argument},
-      {ERROR_INVALID_NAME, errc::no_such_file_or_directory},
-      {ERROR_INVALID_PARAMETER, errc::invalid_argument},
-      {ERROR_LOCK_VIOLATION, errc::no_lock_available},
-      {ERROR_LOCKED, errc::no_lock_available},
-      {ERROR_NEGATIVE_SEEK, errc::invalid_argument},
-      {ERROR_NOACCESS, errc::permission_denied},
-      {ERROR_NOT_ENOUGH_MEMORY, errc::not_enough_memory},
-      {ERROR_NOT_READY, errc::resource_unavailable_try_again},
-      {ERROR_NOT_SAME_DEVICE, errc::cross_device_link},
-      {ERROR_NOT_SUPPORTED, errc::not_supported},
-      {ERROR_OPEN_FAILED, errc::io_error},
-      {ERROR_OPEN_FILES, errc::device_or_resource_busy},
-      {ERROR_OPERATION_ABORTED, errc::operation_canceled},
-      {ERROR_OUTOFMEMORY, errc::not_enough_memory},
-      {ERROR_PATH_NOT_FOUND, errc::no_such_file_or_directory},
-      {ERROR_READ_FAULT, errc::io_error},
-      {ERROR_REPARSE_TAG_INVALID, errc::invalid_argument},
-      {ERROR_RETRY, errc::resource_unavailable_try_again},
-      {ERROR_SEEK, errc::io_error},
-      {ERROR_SHARING_VIOLATION, errc::permission_denied},
-      {ERROR_TOO_MANY_OPEN_FILES, errc::too_many_files_open},
-      {ERROR_WRITE_FAULT, errc::io_error},
-      {ERROR_WRITE_PROTECT, errc::permission_denied},
-  };
-
-  for (const auto& pair : win_error_mapping)
-    if (pair.win == static_cast<DWORD>(err))
-      return pair.errc;
-  return errc::invalid_argument;
-}
-
-#endif // _LIBCPP_WIN32API
+// On windows, libc functions use errno, but system functions use GetLastError.
+// So, callers need to be careful which of these next functions they call!
 
 inline error_code capture_errno() {
   _LIBCPP_ASSERT_INTERNAL(errno != 0, "Expected errno to be non-zero");
   return error_code(errno, generic_category());
 }
 
+inline error_code get_last_error() {
 #if defined(_LIBCPP_WIN32API)
-inline error_code make_windows_error(int err) { return make_error_code(__win_err_to_errc(err)); }
+  return std::error_code(GetLastError(), std::system_category());
+#else
+  return capture_errno();
 #endif
+}
 
 template <class T>
 T error_value();
diff --git a/libcxx/src/filesystem/file_descriptor.h b/libcxx/src/filesystem/file_descriptor.h
index db66ad55bd4f..9c279c451f28 100644
--- a/libcxx/src/filesystem/file_descriptor.h
+++ b/libcxx/src/filesystem/file_descriptor.h
@@ -201,7 +201,7 @@ inline perms posix_get_perms(const StatT& st) noexcept { return static_cast<perm
 inline file_status create_file_status(error_code& m_ec, path const& p, const StatT& path_stat, error_code* ec) {
   if (ec)
     *ec = m_ec;
-  if (m_ec && (m_ec.value() == ENOENT || m_ec.value() == ENOTDIR)) {
+  if (m_ec && (m_ec == errc::no_such_file_or_directory || m_ec == errc::not_a_directory)) {
     return file_status(file_type::not_found);
   } else if (m_ec) {
     ErrorHandler<void> err("posix_stat", ec, &p);
@@ -236,7 +236,7 @@ inline file_status create_file_status(error_code& m_ec, path const& p, const Sta
 inline file_status posix_stat(path const& p, StatT& path_stat, error_code* ec) {
   error_code m_ec;
   if (detail::stat(p.c_str(), &path_stat) == -1)
-    m_ec = detail::capture_errno();
+    m_ec = detail::get_last_error();
   return create_file_status(m_ec, p, path_stat, ec);
 }
 
@@ -248,7 +248,7 @@ inline file_status posix_stat(path const& p, error_code* ec) {
 inline file_status posix_lstat(path const& p, StatT& path_stat, error_code* ec) {
   error_code m_ec;
   if (detail::lstat(p.c_str(), &path_stat) == -1)
-    m_ec = detail::capture_errno();
+    m_ec = detail::get_last_error();
   return create_file_status(m_ec, p, path_stat, ec);
 }
 
@@ -260,7 +260,7 @@ inline file_status posix_lstat(path const& p, error_code* ec) {
 // http://pubs.opengroup.org/onlinepubs/9699919799/functions/ftruncate.html
 inline bool posix_ftruncate(const FileDescriptor& fd, off_t to_size, error_code& ec) {
   if (detail::ftruncate(fd.fd, to_size) == -1) {
-    ec = capture_errno();
+    ec = get_last_error();
     return true;
   }
   ec.clear();
@@ -269,7 +269,7 @@ inline bool posix_ftruncate(const FileDescriptor& fd, off_t to_size, error_code&
 
 inline bool posix_fchmod(const FileDescriptor& fd, const StatT& st, error_code& ec) {
   if (detail::fchmod(fd.fd, st.st_mode) == -1) {
-    ec = capture_errno();
+    ec = get_last_error();
     return true;
   }
   ec.clear();
@@ -286,7 +286,7 @@ inline file_status FileDescriptor::refresh_status(error_code& ec) {
   m_stat   = {};
   error_code m_ec;
   if (detail::fstat(fd, &m_stat) == -1)
-    m_ec = capture_errno();
+    m_ec = get_last_error();
   m_status = create_file_status(m_ec, name, m_stat, &ec);
   return m_status;
 }
diff --git a/libcxx/src/filesystem/operations.cpp b/libcxx/src/filesystem/operations.cpp
index bd37c5af86f6..23c1c281ba1c 100644
--- a/libcxx/src/filesystem/operations.cpp
+++ b/libcxx/src/filesystem/operations.cpp
@@ -39,8 +39,16 @@
 #include <fcntl.h> /* values for fchmodat */
 #include <time.h>
 
-// since Linux 4.5 and FreeBSD 13, but the Linux libc wrapper is only provided by glibc and musl
-#if (defined(__linux__) && (defined(__GLIBC__) || _LIBCPP_HAS_MUSL_LIBC)) || defined(__FreeBSD__)
+// since Linux 4.5 and FreeBSD 13, but the Linux libc wrapper is only provided by glibc >= 2.27 and musl
+#if defined(__linux__)
+#  if defined(_LIBCPP_GLIBC_PREREQ)
+#    if _LIBCPP_GLIBC_PREREQ(2, 27)
+#      define _LIBCPP_FILESYSTEM_USE_COPY_FILE_RANGE
+#    endif
+#  elif _LIBCPP_HAS_MUSL_LIBC
+#    define _LIBCPP_FILESYSTEM_USE_COPY_FILE_RANGE
+#  endif
+#elif defined(__FreeBSD__)
 #  define _LIBCPP_FILESYSTEM_USE_COPY_FILE_RANGE
 #endif
 #if __has_include(<sys/sendfile.h>)
@@ -100,7 +108,7 @@ path __canonical(path const& orig_p, error_code* ec) {
 #if (defined(_POSIX_VERSION) && _POSIX_VERSION >= 200112) || defined(_LIBCPP_WIN32API)
   std::unique_ptr<path::value_type, decltype(&::free)> hold(detail::realpath(p.c_str(), nullptr), &::free);
   if (hold.get() == nullptr)
-    return err.report(capture_errno());
+    return err.report(detail::get_last_error());
   return {hold.get()};
 #else
 #  if defined(__MVS__) && !defined(PATH_MAX)
@@ -110,7 +118,7 @@ path __canonical(path const& orig_p, error_code* ec) {
 #  endif
   path::value_type* ret;
   if ((ret = detail::realpath(p.c_str(), buff)) == nullptr)
-    return err.report(capture_errno());
+    return err.report(detail::get_last_error());
   return {ret};
 #endif
 }
@@ -238,8 +246,14 @@ bool copy_file_impl_copy_file_range(FileDescriptor& read_fd, FileDescriptor& wri
     return false;
   }
   // do not modify the fd positions as copy_file_impl_sendfile may be called after a partial copy
+#  if defined(__linux__)
+  loff_t off_in  = 0;
+  loff_t off_out = 0;
+#  else
   off_t off_in  = 0;
   off_t off_out = 0;
+#  endif
+
   do {
     ssize_t res;
 
@@ -499,9 +513,9 @@ bool __create_directory(const path& p, error_code* ec) {
   if (detail::mkdir(p.c_str(), static_cast<int>(perms::all)) == 0)
     return true;
 
-  if (errno != EEXIST)
-    return err.report(capture_errno());
-  error_code mec = capture_errno();
+  error_code mec = detail::get_last_error();
+  if (mec != errc::file_exists)
+    return err.report(mec);
   error_code ignored_ec;
   const file_status st = status(p, ignored_ec);
   if (!is_directory(st))
@@ -523,10 +537,10 @@ bool __create_directory(path const& p, path const& attributes, error_code* ec) {
   if (detail::mkdir(p.c_str(), attr_stat.st_mode) == 0)
     return true;
 
-  if (errno != EEXIST)
-    return err.report(capture_errno());
+  mec = detail::get_last_error();
+  if (mec != errc::file_exists)
+    return err.report(mec);
 
-  mec = capture_errno();
   error_code ignored_ec;
   st = status(p, ignored_ec);
   if (!is_directory(st))
@@ -537,19 +551,19 @@ bool __create_directory(path const& p, path const& attributes, error_code* ec) {
 void __create_directory_symlink(path const& from, path const& to, error_code* ec) {
   ErrorHandler<void> err("create_directory_symlink", ec, &from, &to);
   if (detail::symlink_dir(from.c_str(), to.c_str()) == -1)
-    return err.report(capture_errno());
+    return err.report(detail::get_last_error());
 }
 
 void __create_hard_link(const path& from, const path& to, error_code* ec) {
   ErrorHandler<void> err("create_hard_link", ec, &from, &to);
   if (detail::link(from.c_str(), to.c_str()) == -1)
-    return err.report(capture_errno());
+    return err.report(detail::get_last_error());
 }
 
 void __create_symlink(path const& from, path const& to, error_code* ec) {
   ErrorHandler<void> err("create_symlink", ec, &from, &to);
   if (detail::symlink_file(from.c_str(), to.c_str()) == -1)
-    return err.report(capture_errno());
+    return err.report(detail::get_last_error());
 }
 
 path __current_path(error_code* ec) {
@@ -592,7 +606,7 @@ path __current_path(error_code* ec) {
 
   unique_ptr<path::value_type, Deleter> hold(detail::getcwd(ptr, size), deleter);
   if (hold.get() == nullptr)
-    return err.report(capture_errno(), "call to getcwd failed");
+    return err.report(detail::get_last_error(), "call to getcwd failed");
 
   return {hold.get()};
 }
@@ -600,7 +614,7 @@ path __current_path(error_code* ec) {
 void __current_path(const path& p, error_code* ec) {
   ErrorHandler<void> err("current_path", ec, &p);
   if (detail::chdir(p.c_str()) == -1)
-    err.report(capture_errno());
+    err.report(detail::get_last_error());
 }
 
 bool __equivalent(const path& p1, const path& p2, error_code* ec) {
@@ -688,10 +702,10 @@ void __last_write_time(const path& p, file_time_type new_time, error_code* ec) {
     return err.report(errc::value_too_large);
   detail::WinHandle h(p.c_str(), FILE_WRITE_ATTRIBUTES, 0);
   if (!h)
-    return err.report(detail::make_windows_error(GetLastError()));
+    return err.report(detail::get_last_error());
   FILETIME last_write = timespec_to_filetime(ts);
   if (!SetFileTime(h, nullptr, nullptr, &last_write))
-    return err.report(detail::make_windows_error(GetLastError()));
+    return err.report(detail::get_last_error());
 #else
   error_code m_ec;
   array<TimeSpec, 2> tbuf;
@@ -749,7 +763,7 @@ void __permissions(const path& p, perms prms, perm_options opts, error_code* ec)
 #if defined(AT_SYMLINK_NOFOLLOW) && defined(AT_FDCWD)
   const int flags = set_sym_perms ? AT_SYMLINK_NOFOLLOW : 0;
   if (detail::fchmodat(AT_FDCWD, p.c_str(), real_perms, flags) == -1) {
-    return err.report(capture_errno());
+    return err.report(detail::get_last_error());
   }
 #else
   if (set_sym_perms)
@@ -777,14 +791,14 @@ path __read_symlink(const path& p, error_code* ec) {
 #else
   StatT sb;
   if (detail::lstat(p.c_str(), &sb) == -1) {
-    return err.report(capture_errno());
+    return err.report(detail::get_last_error());
   }
   const size_t size = sb.st_size + 1;
   auto buff         = unique_ptr<path::value_type[]>(new path::value_type[size]);
 #endif
   detail::SSizeT ret;
   if ((ret = detail::readlink(p.c_str(), buff.get(), size)) == -1)
-    return err.report(capture_errno());
+    return err.report(detail::get_last_error());
   // Note that `ret` returning `0` would work, resulting in a valid empty string being returned.
   if (static_cast<size_t>(ret) >= size)
     return err.report(errc::value_too_large);
@@ -795,8 +809,9 @@ path __read_symlink(const path& p, error_code* ec) {
 bool __remove(const path& p, error_code* ec) {
   ErrorHandler<bool> err("remove", ec, &p);
   if (detail::remove(p.c_str()) == -1) {
-    if (errno != ENOENT)
-      err.report(capture_errno());
+    error_code mec = detail::get_last_error();
+    if (mec != errc::no_such_file_or_directory)
+      err.report(mec);
     return false;
   }
   return true;
@@ -949,13 +964,13 @@ uintmax_t __remove_all(const path& p, error_code* ec) {
 void __rename(const path& from, const path& to, error_code* ec) {
   ErrorHandler<void> err("rename", ec, &from, &to);
   if (detail::rename(from.c_str(), to.c_str()) == -1)
-    err.report(capture_errno());
+    err.report(detail::get_last_error());
 }
 
 void __resize_file(const path& p, uintmax_t size, error_code* ec) {
   ErrorHandler<void> err("resize_file", ec, &p);
   if (detail::truncate(p.c_str(), static_cast< ::off_t>(size)) == -1)
-    return err.report(capture_errno());
+    return err.report(detail::get_last_error());
 }
 
 space_info __space(const path& p, error_code* ec) {
@@ -963,7 +978,7 @@ space_info __space(const path& p, error_code* ec) {
   space_info si;
   detail::StatVFS m_svfs = {};
   if (detail::statvfs(p.c_str(), &m_svfs) == -1) {
-    err.report(capture_errno());
+    err.report(detail::get_last_error());
     si.capacity = si.free = si.available = static_cast<uintmax_t>(-1);
     return si;
   }
@@ -990,7 +1005,7 @@ path __temp_directory_path(error_code* ec) {
   wchar_t buf[MAX_PATH];
   DWORD retval = GetTempPathW(MAX_PATH, buf);
   if (!retval)
-    return err.report(detail::make_windows_error(GetLastError()));
+    return err.report(detail::get_last_error());
   if (retval > MAX_PATH)
     return err.report(errc::filename_too_long);
   // GetTempPathW returns a path with a trailing slash, which we
diff --git a/libcxx/src/filesystem/posix_compat.h b/libcxx/src/filesystem/posix_compat.h
index b41c004341af..ddd99d8aaf20 100644
--- a/libcxx/src/filesystem/posix_compat.h
+++ b/libcxx/src/filesystem/posix_compat.h
@@ -11,9 +11,10 @@
 //
 // These generally behave like the proper posix functions, with these
 // exceptions:
-// On Windows, they take paths in wchar_t* form, instead of char* form.
-// The symlink() function is split into two frontends, symlink_file()
-// and symlink_dir().
+// - On Windows, they take paths in wchar_t* form, instead of char* form.
+// - The symlink() function is split into two frontends, symlink_file()
+//   and symlink_dir().
+// - Errors should be retrieved with get_last_error, not errno.
 //
 // These are provided within an anonymous namespace within the detail
 // namespace - callers need to include this header and call them as
@@ -122,11 +123,6 @@ namespace detail {
 
 #  define O_NONBLOCK 0
 
-inline int set_errno(int e = GetLastError()) {
-  errno = static_cast<int>(__win_err_to_errc(e));
-  return -1;
-}
-
 class WinHandle {
 public:
   WinHandle(const wchar_t* p, DWORD access, DWORD flags) {
@@ -153,7 +149,7 @@ private:
 inline int stat_handle(HANDLE h, StatT* buf) {
   FILE_BASIC_INFO basic;
   if (!GetFileInformationByHandleEx(h, FileBasicInfo, &basic, sizeof(basic)))
-    return set_errno();
+    return -1;
   memset(buf, 0, sizeof(*buf));
   buf->st_mtim = filetime_to_timespec(basic.LastWriteTime);
   buf->st_atim = filetime_to_timespec(basic.LastAccessTime);
@@ -168,18 +164,18 @@ inline int stat_handle(HANDLE h, StatT* buf) {
   if (basic.FileAttributes & FILE_ATTRIBUTE_REPARSE_POINT) {
     FILE_ATTRIBUTE_TAG_INFO tag;
     if (!GetFileInformationByHandleEx(h, FileAttributeTagInfo, &tag, sizeof(tag)))
-      return set_errno();
+      return -1;
     if (tag.ReparseTag == IO_REPARSE_TAG_SYMLINK)
       buf->st_mode = (buf->st_mode & ~_S_IFMT) | _S_IFLNK;
   }
   FILE_STANDARD_INFO standard;
   if (!GetFileInformationByHandleEx(h, FileStandardInfo, &standard, sizeof(standard)))
-    return set_errno();
+    return -1;
   buf->st_nlink = standard.NumberOfLinks;
   buf->st_size  = standard.EndOfFile.QuadPart;
   BY_HANDLE_FILE_INFORMATION info;
   if (!GetFileInformationByHandle(h, &info))
-    return set_errno();
+    return -1;
   buf->st_dev = info.dwVolumeSerialNumber;
   memcpy(&buf->st_ino.id[0], &info.nFileIndexHigh, 4);
   memcpy(&buf->st_ino.id[4], &info.nFileIndexLow, 4);
@@ -189,7 +185,7 @@ inline int stat_handle(HANDLE h, StatT* buf) {
 inline int stat_file(const wchar_t* path, StatT* buf, DWORD flags) {
   WinHandle h(path, FILE_READ_ATTRIBUTES, flags);
   if (!h)
-    return set_errno();
+    return -1;
   int ret = stat_handle(h, buf);
   return ret;
 }
@@ -206,7 +202,7 @@ inline int fstat(int fd, StatT* buf) {
 inline int mkdir(const wchar_t* path, int permissions) {
   (void)permissions;
   if (!CreateDirectoryW(path, nullptr))
-    return set_errno();
+    return -1;
   return 0;
 }
 
@@ -219,10 +215,10 @@ inline int symlink_file_dir(const wchar_t* oldname, const wchar_t* newname, bool
     return 0;
   int e = GetLastError();
   if (e != ERROR_INVALID_PARAMETER)
-    return set_errno(e);
+    return -1;
   if (CreateSymbolicLinkW(newname, oldname, flags))
     return 0;
-  return set_errno();
+  return -1;
 }
 
 inline int symlink_file(const wchar_t* oldname, const wchar_t* newname) {
@@ -236,17 +232,17 @@ inline int symlink_dir(const wchar_t* oldname, const wchar_t* newname) {
 inline int link(const wchar_t* oldname, const wchar_t* newname) {
   if (CreateHardLinkW(newname, oldname, nullptr))
     return 0;
-  return set_errno();
+  return -1;
 }
 
 inline int remove(const wchar_t* path) {
   detail::WinHandle h(path, DELETE, FILE_FLAG_OPEN_REPARSE_POINT);
   if (!h)
-    return set_errno();
+    return -1;
   FILE_DISPOSITION_INFO info;
   info.DeleteFile = TRUE;
   if (!SetFileInformationByHandle(h, FileDispositionInfo, &info, sizeof(info)))
-    return set_errno();
+    return -1;
   return 0;
 }
 
@@ -254,9 +250,9 @@ inline int truncate_handle(HANDLE h, off_t length) {
   LARGE_INTEGER size_param;
   size_param.QuadPart = length;
   if (!SetFilePointerEx(h, size_param, 0, FILE_BEGIN))
-    return set_errno();
+    return -1;
   if (!SetEndOfFile(h))
-    return set_errno();
+    return -1;
   return 0;
 }
 
@@ -268,19 +264,19 @@ inline int ftruncate(int fd, off_t length) {
 inline int truncate(const wchar_t* path, off_t length) {
   detail::WinHandle h(path, GENERIC_WRITE, 0);
   if (!h)
-    return set_errno();
+    return -1;
   return truncate_handle(h, length);
 }
 
 inline int rename(const wchar_t* from, const wchar_t* to) {
   if (!(MoveFileExW(from, to, MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH)))
-    return set_errno();
+    return -1;
   return 0;
 }
 
 inline int chdir(const wchar_t* path) {
   if (!SetCurrentDirectoryW(path))
-    return set_errno();
+    return -1;
   return 0;
 }
 
@@ -300,7 +296,7 @@ inline int statvfs(const wchar_t* p, StatVFS* buf) {
       break;
     path parent = dir.parent_path();
     if (parent == dir) {
-      errno = ENOENT;
+      SetLastError(ERROR_PATH_NOT_FOUND);
       return -1;
     }
     dir = parent;
@@ -308,7 +304,7 @@ inline int statvfs(const wchar_t* p, StatVFS* buf) {
   ULARGE_INTEGER free_bytes_available_to_caller, total_number_of_bytes, total_number_of_free_bytes;
   if (!GetDiskFreeSpaceExW(
           dir.c_str(), &free_bytes_available_to_caller, &total_number_of_bytes, &total_number_of_free_bytes))
-    return set_errno();
+    return -1;
   buf->f_frsize = 1;
   buf->f_blocks = total_number_of_bytes.QuadPart;
   buf->f_bfree  = total_number_of_free_bytes.QuadPart;
@@ -330,7 +326,6 @@ inline wchar_t* getcwd([[maybe_unused]] wchar_t* in_buf, [[maybe_unused]] size_t
     retval = GetCurrentDirectoryW(buff_size, buff.get());
   }
   if (!retval) {
-    set_errno();
     return nullptr;
   }
   return buff.release();
@@ -342,7 +337,6 @@ inline wchar_t* realpath(const wchar_t* path, [[maybe_unused]] wchar_t* resolved
 
   WinHandle h(path, FILE_READ_ATTRIBUTES, 0);
   if (!h) {
-    set_errno();
     return nullptr;
   }
   size_t buff_size = MAX_PATH + 10;
@@ -354,7 +348,6 @@ inline wchar_t* realpath(const wchar_t* path, [[maybe_unused]] wchar_t* resolved
     retval = GetFinalPathNameByHandleW(h, buff.get(), buff_size, FILE_NAME_NORMALIZED | VOLUME_NAME_DOS);
   }
   if (!retval) {
-    set_errno();
     return nullptr;
   }
   wchar_t* ptr = buff.get();
@@ -376,20 +369,20 @@ using ModeT = int;
 inline int fchmod_handle(HANDLE h, int perms) {
   FILE_BASIC_INFO basic;
   if (!GetFileInformationByHandleEx(h, FileBasicInfo, &basic, sizeof(basic)))
-    return set_errno();
+    return -1;
   DWORD orig_attributes = basic.FileAttributes;
   basic.FileAttributes &= ~FILE_ATTRIBUTE_READONLY;
   if ((perms & 0222) == 0)
     basic.FileAttributes |= FILE_ATTRIBUTE_READONLY;
   if (basic.FileAttributes != orig_attributes && !SetFileInformationByHandle(h, FileBasicInfo, &basic, sizeof(basic)))
-    return set_errno();
+    return -1;
   return 0;
 }
 
 inline int fchmodat(int /*fd*/, const wchar_t* path, int perms, int flag) {
   DWORD attributes = GetFileAttributesW(path);
   if (attributes == INVALID_FILE_ATTRIBUTES)
-    return set_errno();
+    return -1;
   if (attributes & FILE_ATTRIBUTE_REPARSE_POINT && !(flag & AT_SYMLINK_NOFOLLOW)) {
     // If the file is a symlink, and we are supposed to operate on the target
     // of the symlink, we need to open a handle to it, without the
@@ -397,7 +390,7 @@ inline int fchmodat(int /*fd*/, const wchar_t* path, int perms, int flag) {
     // symlink, and operate on it via the handle.
     detail::WinHandle h(path, FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES, 0);
     if (!h)
-      return set_errno();
+      return -1;
     return fchmod_handle(h, perms);
   } else {
     // For a non-symlink, or if operating on the symlink itself instead of
@@ -407,7 +400,7 @@ inline int fchmodat(int /*fd*/, const wchar_t* path, int perms, int flag) {
     if ((perms & 0222) == 0)
       attributes |= FILE_ATTRIBUTE_READONLY;
     if (attributes != orig_attributes && !SetFileAttributesW(path, attributes))
-      return set_errno();
+      return -1;
   }
   return 0;
 }
@@ -424,18 +417,18 @@ inline SSizeT readlink(const wchar_t* path, wchar_t* ret_buf, size_t bufsize) {
   uint8_t buf[MAXIMUM_REPARSE_DATA_BUFFER_SIZE];
   detail::WinHandle h(path, FILE_READ_ATTRIBUTES, FILE_FLAG_OPEN_REPARSE_POINT);
   if (!h)
-    return set_errno();
+    return -1;
   DWORD out;
   if (!DeviceIoControl(h, FSCTL_GET_REPARSE_POINT, nullptr, 0, buf, sizeof(buf), &out, 0))
-    return set_errno();
+    return -1;
   const auto* reparse    = reinterpret_cast<LIBCPP_REPARSE_DATA_BUFFER*>(buf);
   size_t path_buf_offset = offsetof(LIBCPP_REPARSE_DATA_BUFFER, SymbolicLinkReparseBuffer.PathBuffer[0]);
   if (out < path_buf_offset) {
-    errno = EINVAL;
+    SetLastError(ERROR_REPARSE_TAG_INVALID);
     return -1;
   }
   if (reparse->ReparseTag != IO_REPARSE_TAG_SYMLINK) {
-    errno = EINVAL;
+    SetLastError(ERROR_REPARSE_TAG_INVALID);
     return -1;
   }
   const auto& symlink = reparse->SymbolicLinkReparseBuffer;
@@ -449,11 +442,11 @@ inline SSizeT readlink(const wchar_t* path, wchar_t* ret_buf, size_t bufsize) {
   }
   // name_offset/length are expressed in bytes, not in wchar_t
   if (path_buf_offset + name_offset + name_length > out) {
-    errno = EINVAL;
+    SetLastError(ERROR_REPARSE_TAG_INVALID);
     return -1;
   }
   if (name_length / sizeof(wchar_t) > bufsize) {
-    errno = ENOMEM;
+    SetLastError(ERROR_NOT_ENOUGH_MEMORY);
     return -1;
   }
   memcpy(ret_buf, &symlink.PathBuffer[name_offset / sizeof(wchar_t)], name_length);
diff --git a/libcxx/src/include/overridable_function.h b/libcxx/src/include/overridable_function.h
index 6c70f6242ddd..7372e347831b 100644
--- a/libcxx/src/include/overridable_function.h
+++ b/libcxx/src/include/overridable_function.h
@@ -29,106 +29,81 @@
 // This is a low-level utility which does not work on all platforms, since it needs
 // to make assumptions about the object file format in use. Furthermore, it requires
 // the "base definition" of the function (the one we want to check whether it has been
-// overridden) to be annotated with the _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE macro.
+// overridden) to be defined using the _LIBCPP_OVERRIDABLE_FUNCTION macro.
 //
 // This currently works with Mach-O files (used on Darwin) and with ELF files (used on Linux
 // and others). On platforms where we know how to implement this detection, the macro
 // _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION is defined to 1, and it is defined to 0 on
-// other platforms. The _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE macro is defined to
-// nothing on unsupported platforms so that it can be used to decorate functions regardless
-// of whether detection is actually supported.
+// other platforms. The _LIBCPP_OVERRIDABLE_FUNCTION macro expands to regular function
+// definition on unsupported platforms so that it can be used to decorate functions
+// regardless of whether detection is actually supported.
 //
 // How does this work?
 // -------------------
 //
 // Let's say we want to check whether a weak function `f` has been overridden by the user.
-// The general mechanism works by placing `f`'s definition (in the libc++ built library)
-// inside a special section, which we do using the `__section__` attribute via the
-// _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE macro.
+// The general mechanism works by defining a symbol `f_impl__` and a weak alias `f` via the
+// _LIBCPP_OVERRIDABLE_FUNCTION macro.
 //
 // Then, when comes the time to check whether the function has been overridden, we take
-// the address of the function and we check whether it falls inside the special function
-// we created. This can be done by finding pointers to the start and the end of the section
-// (which is done differently for ELF and Mach-O), and then checking whether `f` falls
-// within those bounds. If it falls within those bounds, then `f` is still inside the
-// special section and so it is the version we defined in the libc++ built library, i.e.
-// it was not overridden. Otherwise, it was overridden by the user because it falls
-// outside of the section.
+// the address of the function `f` and we check whether it is different from `f_impl__`.
+// If so it means the function was overriden by the user.
 //
 // Important note
 // --------------
 //
-// This mechanism should never be used outside of the libc++ built library. In particular,
-// attempting to use this within the libc++ headers will not work at all because we don't
-// want to be defining special sections inside user's executables which use our headers.
+// This mechanism should never be used outside of the libc++ built library. Functions defined
+// with this macro must be defined at global scope.
 //
 
 #if defined(_LIBCPP_OBJECT_FORMAT_MACHO)
 
-#  define _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION 1
-#  define _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE                                                                 \
-    __attribute__((__section__("__TEXT,__lcxx_override,regular,pure_instructions")))
-
 _LIBCPP_BEGIN_NAMESPACE_STD
-template <class _Ret, class... _Args>
-_LIBCPP_HIDE_FROM_ABI bool __is_function_overridden(_Ret (*__fptr)(_Args...)) noexcept {
-  // Declare two dummy bytes and give them these special `__asm` values. These values are
-  // defined by the linker, which means that referring to `&__lcxx_override_start` will
-  // effectively refer to the address where the section starts (and same for the end).
-  extern char __lcxx_override_start __asm("section$start$__TEXT$__lcxx_override");
-  extern char __lcxx_override_end __asm("section$end$__TEXT$__lcxx_override");
-
-  // Now get a uintptr_t out of these locations, and out of the function pointer.
-  uintptr_t __start = reinterpret_cast<uintptr_t>(&__lcxx_override_start);
-  uintptr_t __end   = reinterpret_cast<uintptr_t>(&__lcxx_override_end);
-  uintptr_t __ptr   = reinterpret_cast<uintptr_t>(__fptr);
-
-#  if __has_feature(ptrauth_calls)
-  // We must pass a void* to ptrauth_strip since it only accepts a pointer type. Also, in particular,
-  // we must NOT pass a function pointer, otherwise we will strip the function pointer, and then attempt
-  // to authenticate and re-sign it when casting it to a uintptr_t again, which will fail because we just
-  // stripped the function pointer. See rdar://122927845.
-  __ptr = reinterpret_cast<uintptr_t>(ptrauth_strip(reinterpret_cast<void*>(__ptr), ptrauth_key_function_pointer));
-#  endif
-
-  // Finally, the function was overridden if it falls outside of the section's bounds.
-  return __ptr < __start || __ptr > __end;
-}
-_LIBCPP_END_NAMESPACE_STD
 
-// The NVPTX linker cannot create '__start/__stop' sections.
-#elif defined(_LIBCPP_OBJECT_FORMAT_ELF) && !defined(__NVPTX__)
+template <auto _Func>
+_LIBCPP_HIDE_FROM_ABI constexpr bool __is_function_overridden();
 
-#  define _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION 1
-#  define _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE __attribute__((__section__("__lcxx_override")))
+_LIBCPP_END_NAMESPACE_STD
 
-// This is very similar to what we do for Mach-O above. The ELF linker will implicitly define
-// variables with those names corresponding to the start and the end of the section.
-//
-// See https://stackoverflow.com/questions/16552710/how-do-you-get-the-start-and-end-addresses-of-a-custom-elf-section
-extern char __start___lcxx_override;
-extern char __stop___lcxx_override;
+#  define _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION 1
+#  define _LIBCPP_OVERRIDABLE_FUNCTION(symbol, type, name, arglist)                                                    \
+    static __attribute__((used)) type symbol##_impl__ arglist __asm__("_" _LIBCPP_TOSTRING(symbol));                   \
+    __asm__(".globl _" _LIBCPP_TOSTRING(symbol));                                                                      \
+    __asm__(".weak_definition _" _LIBCPP_TOSTRING(symbol));                                                            \
+    extern __typeof(symbol##_impl__) name __attribute__((weak_import));                                                \
+    _LIBCPP_BEGIN_NAMESPACE_STD                                                                                        \
+    template <>                                                                                                        \
+    inline bool __is_function_overridden<static_cast<type(*) arglist>(name)>() {                                       \
+      return static_cast<type(*) arglist>(name) != symbol##_impl__;                                                    \
+    }                                                                                                                  \
+    _LIBCPP_END_NAMESPACE_STD                                                                                          \
+    static type symbol##_impl__ arglist
+
+#elif defined(_LIBCPP_OBJECT_FORMAT_ELF)
 
 _LIBCPP_BEGIN_NAMESPACE_STD
-template <class _Ret, class... _Args>
-_LIBCPP_HIDE_FROM_ABI bool __is_function_overridden(_Ret (*__fptr)(_Args...)) noexcept {
-  uintptr_t __start = reinterpret_cast<uintptr_t>(&__start___lcxx_override);
-  uintptr_t __end   = reinterpret_cast<uintptr_t>(&__stop___lcxx_override);
-  uintptr_t __ptr   = reinterpret_cast<uintptr_t>(__fptr);
-
-#  if __has_feature(ptrauth_calls)
-  // We must pass a void* to ptrauth_strip since it only accepts a pointer type. See full explanation above.
-  __ptr = reinterpret_cast<uintptr_t>(ptrauth_strip(reinterpret_cast<void*>(__ptr), ptrauth_key_function_pointer));
-#  endif
-
-  return __ptr < __start || __ptr > __end;
-}
+
+template <auto _Func>
+_LIBCPP_HIDE_FROM_ABI constexpr bool __is_function_overridden();
+
 _LIBCPP_END_NAMESPACE_STD
 
+#  define _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION 1
+#  define _LIBCPP_OVERRIDABLE_FUNCTION(symbol, type, name, arglist)                                                    \
+    static type symbol##_impl__ arglist __asm__(_LIBCPP_TOSTRING(symbol##_impl__));                                    \
+    [[gnu::weak, gnu::alias(_LIBCPP_TOSTRING(symbol##_impl__))]] type name arglist;                                    \
+    _LIBCPP_BEGIN_NAMESPACE_STD                                                                                        \
+    template <>                                                                                                        \
+    inline bool __is_function_overridden<static_cast<type(*) arglist>(name)>() {                                       \
+      return static_cast<type(*) arglist>(name) != symbol##_impl__;                                                    \
+    }                                                                                                                  \
+    _LIBCPP_END_NAMESPACE_STD                                                                                          \
+    static type symbol##_impl__ arglist
+
 #else
 
 #  define _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION 0
-#  define _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE /* nothing */
+#  define _LIBCPP_OVERRIDABLE_FUNCTION(symbol, type, name, arglist) _LIBCPP_WEAK type name arglist
 
 #endif
 
diff --git a/libcxx/src/new.cpp b/libcxx/src/new.cpp
index e010fe4c4f19..b14b52248df3 100644
--- a/libcxx/src/new.cpp
+++ b/libcxx/src/new.cpp
@@ -43,7 +43,7 @@ static void* operator_new_impl(std::size_t size) {
   return p;
 }
 
-_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void* operator new(std::size_t size) _THROW_BAD_ALLOC {
+_LIBCPP_OVERRIDABLE_FUNCTION(_Znwm, void*, operator new, (std::size_t size)) _THROW_BAD_ALLOC {
   void* p = operator_new_impl(size);
   if (p == nullptr)
     __throw_bad_alloc_shim();
@@ -54,7 +54,7 @@ _LIBCPP_WEAK void* operator new(size_t size, const std::nothrow_t&) noexcept {
 #  if !_LIBCPP_HAS_EXCEPTIONS
 #    if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
   _LIBCPP_ASSERT_SHIM(
-      !std::__is_function_overridden(static_cast<void* (*)(std::size_t)>(&operator new)),
+      !std::__is_function_overridden<static_cast<void* (*)(std::size_t)>(&operator new)>(),
       "libc++ was configured with exceptions disabled and `operator new(size_t)` has been overridden, "
       "but `operator new(size_t, nothrow_t)` has not been overridden. This is problematic because "
       "`operator new(size_t, nothrow_t)` must call `operator new(size_t)`, which will terminate in case "
@@ -74,7 +74,7 @@ _LIBCPP_WEAK void* operator new(size_t size, const std::nothrow_t&) noexcept {
 #  endif
 }
 
-_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void* operator new[](size_t size) _THROW_BAD_ALLOC {
+_LIBCPP_OVERRIDABLE_FUNCTION(_Znam, void*, operator new[], (size_t size)) _THROW_BAD_ALLOC {
   return ::operator new(size);
 }
 
@@ -82,7 +82,7 @@ _LIBCPP_WEAK void* operator new[](size_t size, const std::nothrow_t&) noexcept {
 #  if !_LIBCPP_HAS_EXCEPTIONS
 #    if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
   _LIBCPP_ASSERT_SHIM(
-      !std::__is_function_overridden(static_cast<void* (*)(std::size_t)>(&operator new[])),
+      !std::__is_function_overridden<static_cast<void* (*)(std::size_t)>(&operator new[])>(),
       "libc++ was configured with exceptions disabled and `operator new[](size_t)` has been overridden, "
       "but `operator new[](size_t, nothrow_t)` has not been overridden. This is problematic because "
       "`operator new[](size_t, nothrow_t)` must call `operator new[](size_t)`, which will terminate in case "
@@ -136,8 +136,8 @@ static void* operator_new_aligned_impl(std::size_t size, std::align_val_t alignm
   return p;
 }
 
-_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void*
-operator new(std::size_t size, std::align_val_t alignment) _THROW_BAD_ALLOC {
+_LIBCPP_OVERRIDABLE_FUNCTION(_ZnwmSt11align_val_t, void*, operator new, (std::size_t size, std::align_val_t alignment))
+_THROW_BAD_ALLOC {
   void* p = operator_new_aligned_impl(size, alignment);
   if (p == nullptr)
     __throw_bad_alloc_shim();
@@ -148,7 +148,7 @@ _LIBCPP_WEAK void* operator new(size_t size, std::align_val_t alignment, const s
 #    if !_LIBCPP_HAS_EXCEPTIONS
 #      if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
   _LIBCPP_ASSERT_SHIM(
-      !std::__is_function_overridden(static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new)),
+      !std::__is_function_overridden<static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new)>(),
       "libc++ was configured with exceptions disabled and `operator new(size_t, align_val_t)` has been overridden, "
       "but `operator new(size_t, align_val_t, nothrow_t)` has not been overridden. This is problematic because "
       "`operator new(size_t, align_val_t, nothrow_t)` must call `operator new(size_t, align_val_t)`, which will "
@@ -168,16 +168,14 @@ _LIBCPP_WEAK void* operator new(size_t size, std::align_val_t alignment, const s
 #    endif
 }
 
-_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void*
-operator new[](size_t size, std::align_val_t alignment) _THROW_BAD_ALLOC {
-  return ::operator new(size, alignment);
-}
+_LIBCPP_OVERRIDABLE_FUNCTION(_ZnamSt11align_val_t, void*, operator new[], (size_t size, std::align_val_t alignment))
+_THROW_BAD_ALLOC { return ::operator new(size, alignment); }
 
 _LIBCPP_WEAK void* operator new[](size_t size, std::align_val_t alignment, const std::nothrow_t&) noexcept {
 #    if !_LIBCPP_HAS_EXCEPTIONS
 #      if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
   _LIBCPP_ASSERT_SHIM(
-      !std::__is_function_overridden(static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new[])),
+      !std::__is_function_overridden<static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new[])>(),
       "libc++ was configured with exceptions disabled and `operator new[](size_t, align_val_t)` has been overridden, "
       "but `operator new[](size_t, align_val_t, nothrow_t)` has not been overridden. This is problematic because "
       "`operator new[](size_t, align_val_t, nothrow_t)` must call `operator new[](size_t, align_val_t)`, which will "
diff --git a/libcxx/src/print.cpp b/libcxx/src/print.cpp
index 37b1fc00cd7c..4937aafe8417 100644
--- a/libcxx/src/print.cpp
+++ b/libcxx/src/print.cpp
@@ -51,7 +51,7 @@ __write_to_windows_console([[maybe_unused]] FILE* __stream, [[maybe_unused]] wst
                     __view.size(),
                     nullptr,
                     nullptr) == 0) {
-    __throw_system_error(filesystem::detail::make_windows_error(GetLastError()), "failed to write formatted output");
+    __throw_system_error(filesystem::detail::get_last_error(), "failed to write formatted output");
   }
 }
 #  endif // _LIBCPP_HAS_WIDE_CHARACTERS
diff --git a/libcxx/src/system_error.cpp b/libcxx/src/system_error.cpp
index d555bca995c4..d5ec73084f63 100644
--- a/libcxx/src/system_error.cpp
+++ b/libcxx/src/system_error.cpp
@@ -14,6 +14,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <optional>
 #include <string.h>
 #include <string>
 #include <system_error>
@@ -24,8 +25,123 @@
 #  include <android/api-level.h>
 #endif
 
+#if defined(_LIBCPP_WIN32API)
+#  include <windows.h>
+#  include <winerror.h>
+#endif
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+#if defined(_LIBCPP_WIN32API)
+
+namespace {
+std::optional<errc> __win_err_to_errc(int err) {
+  switch (err) {
+  case ERROR_ACCESS_DENIED:
+    return errc::permission_denied;
+  case ERROR_ALREADY_EXISTS:
+    return errc::file_exists;
+  case ERROR_BAD_NETPATH:
+    return errc::no_such_file_or_directory;
+  case ERROR_BAD_PATHNAME:
+    return errc::no_such_file_or_directory;
+  case ERROR_BAD_UNIT:
+    return errc::no_such_device;
+  case ERROR_BROKEN_PIPE:
+    return errc::broken_pipe;
+  case ERROR_BUFFER_OVERFLOW:
+    return errc::filename_too_long;
+  case ERROR_BUSY:
+    return errc::device_or_resource_busy;
+  case ERROR_BUSY_DRIVE:
+    return errc::device_or_resource_busy;
+  case ERROR_CANNOT_MAKE:
+    return errc::permission_denied;
+  case ERROR_CANTOPEN:
+    return errc::io_error;
+  case ERROR_CANTREAD:
+    return errc::io_error;
+  case ERROR_CANTWRITE:
+    return errc::io_error;
+  case ERROR_CURRENT_DIRECTORY:
+    return errc::permission_denied;
+  case ERROR_DEV_NOT_EXIST:
+    return errc::no_such_device;
+  case ERROR_DEVICE_IN_USE:
+    return errc::device_or_resource_busy;
+  case ERROR_DIR_NOT_EMPTY:
+    return errc::directory_not_empty;
+  case ERROR_DIRECTORY:
+    return errc::invalid_argument;
+  case ERROR_DISK_FULL:
+    return errc::no_space_on_device;
+  case ERROR_FILE_EXISTS:
+    return errc::file_exists;
+  case ERROR_FILE_NOT_FOUND:
+    return errc::no_such_file_or_directory;
+  case ERROR_HANDLE_DISK_FULL:
+    return errc::no_space_on_device;
+  case ERROR_INVALID_ACCESS:
+    return errc::permission_denied;
+  case ERROR_INVALID_DRIVE:
+    return errc::no_such_device;
+  case ERROR_INVALID_FUNCTION:
+    return errc::function_not_supported;
+  case ERROR_INVALID_HANDLE:
+    return errc::invalid_argument;
+  case ERROR_INVALID_NAME:
+    return errc::no_such_file_or_directory;
+  case ERROR_INVALID_PARAMETER:
+    return errc::invalid_argument;
+  case ERROR_LOCK_VIOLATION:
+    return errc::no_lock_available;
+  case ERROR_LOCKED:
+    return errc::no_lock_available;
+  case ERROR_NEGATIVE_SEEK:
+    return errc::invalid_argument;
+  case ERROR_NOACCESS:
+    return errc::permission_denied;
+  case ERROR_NOT_ENOUGH_MEMORY:
+    return errc::not_enough_memory;
+  case ERROR_NOT_READY:
+    return errc::resource_unavailable_try_again;
+  case ERROR_NOT_SAME_DEVICE:
+    return errc::cross_device_link;
+  case ERROR_NOT_SUPPORTED:
+    return errc::not_supported;
+  case ERROR_OPEN_FAILED:
+    return errc::io_error;
+  case ERROR_OPEN_FILES:
+    return errc::device_or_resource_busy;
+  case ERROR_OPERATION_ABORTED:
+    return errc::operation_canceled;
+  case ERROR_OUTOFMEMORY:
+    return errc::not_enough_memory;
+  case ERROR_PATH_NOT_FOUND:
+    return errc::no_such_file_or_directory;
+  case ERROR_READ_FAULT:
+    return errc::io_error;
+  case ERROR_REPARSE_TAG_INVALID:
+    return errc::invalid_argument;
+  case ERROR_RETRY:
+    return errc::resource_unavailable_try_again;
+  case ERROR_SEEK:
+    return errc::io_error;
+  case ERROR_SHARING_VIOLATION:
+    return errc::permission_denied;
+  case ERROR_TOO_MANY_OPEN_FILES:
+    return errc::too_many_files_open;
+  case ERROR_WRITE_FAULT:
+    return errc::io_error;
+  case ERROR_WRITE_PROTECT:
+    return errc::permission_denied;
+  default:
+    return {};
+  }
+}
+} // namespace
+#endif
+
 namespace {
 #if _LIBCPP_HAS_THREADS
 
@@ -157,19 +273,52 @@ public:
 const char* __system_error_category::name() const noexcept { return "system"; }
 
 string __system_error_category::message(int ev) const {
-#ifdef _LIBCPP_ELAST
+#ifdef _LIBCPP_WIN32API
+  std::string result;
+  char* str               = nullptr;
+  unsigned long num_chars = ::FormatMessageA(
+      FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+      nullptr,
+      ev,
+      0,
+      reinterpret_cast<char*>(&str),
+      0,
+      nullptr);
+  auto is_whitespace = [](char ch) { return ch == '\n' || ch == '\r' || ch == ' '; };
+  while (num_chars > 0 && is_whitespace(str[num_chars - 1]))
+    --num_chars;
+
+  if (num_chars)
+    result = std::string(str, num_chars);
+  else
+    result = "Unknown error";
+
+  LocalFree(str);
+  return result;
+#else
+#  ifdef _LIBCPP_ELAST
   if (ev > _LIBCPP_ELAST)
     return string("unspecified system_category error");
-#endif // _LIBCPP_ELAST
+#  endif // _LIBCPP_ELAST
   return __do_message::message(ev);
+#endif
 }
 
 error_condition __system_error_category::default_error_condition(int ev) const noexcept {
-#ifdef _LIBCPP_ELAST
+#ifdef _LIBCPP_WIN32API
+  // Remap windows error codes to generic error codes if possible.
+  if (ev == 0)
+    return error_condition(0, generic_category());
+  if (auto maybe_errc = __win_err_to_errc(ev))
+    return error_condition(static_cast<int>(*maybe_errc), generic_category());
+  return error_condition(ev, system_category());
+#else
+#  ifdef _LIBCPP_ELAST
   if (ev > _LIBCPP_ELAST)
     return error_condition(ev, system_category());
-#endif // _LIBCPP_ELAST
+#  endif // _LIBCPP_ELAST
   return error_condition(ev, generic_category());
+#endif
 }
 
 const error_category& system_category() noexcept {
@@ -213,7 +362,7 @@ system_error::~system_error() noexcept {}
 
 void __throw_system_error(int ev, const char* what_arg) {
 #if _LIBCPP_HAS_EXCEPTIONS
-  std::__throw_system_error(error_code(ev, system_category()), what_arg);
+  std::__throw_system_error(error_code(ev, generic_category()), what_arg);
 #else
   // The above could also handle the no-exception case, but for size, avoid referencing system_category() unnecessarily.
   _LIBCPP_VERBOSE_ABORT(
diff --git a/libcxx/test/libcxx/containers/sequences/vector.bool/assert.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector.bool/assert.pass.cpp
new file mode 100644
index 000000000000..41badad8f569
--- /dev/null
+++ b/libcxx/test/libcxx/containers/sequences/vector.bool/assert.pass.cpp
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <vector>
+
+// Test hardening assertions for std::vector<bool>.
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: libcpp-hardening-mode=none
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <vector>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+template <class Allocator>
+void test() {
+  std::vector<bool, Allocator> c;
+  TEST_LIBCPP_ASSERT_FAILURE(c.front(), "vector<bool>::front() called on an empty vector");
+  TEST_LIBCPP_ASSERT_FAILURE(c.back(), "vector<bool>::back() called on an empty vector");
+  TEST_LIBCPP_ASSERT_FAILURE(c[0], "vector<bool>::operator[] index out of bounds");
+  TEST_LIBCPP_ASSERT_FAILURE(c.pop_back(), "vector<bool>::pop_back called on an empty vector");
+
+  // Repeat the test with a const reference to test the const overloads.
+  {
+    const std::vector<bool, Allocator>& cc = c;
+    TEST_LIBCPP_ASSERT_FAILURE(cc.front(), "vector<bool>::front() called on an empty vector");
+    TEST_LIBCPP_ASSERT_FAILURE(cc.back(), "vector<bool>::back() called on an empty vector");
+    TEST_LIBCPP_ASSERT_FAILURE(cc[0], "vector<bool>::operator[] index out of bounds");
+  }
+
+  c.push_back(true);
+  c.push_back(false);
+  c.push_back(true);
+  TEST_LIBCPP_ASSERT_FAILURE(c[3], "vector<bool>::operator[] index out of bounds");
+  TEST_LIBCPP_ASSERT_FAILURE(c[100], "vector<bool>::operator[] index out of bounds");
+
+  // Repeat the test with a const reference to test the const overloads.
+  {
+    const std::vector<bool, Allocator>& cc = c;
+    TEST_LIBCPP_ASSERT_FAILURE(cc[3], "vector<bool>::operator[] index out of bounds");
+    TEST_LIBCPP_ASSERT_FAILURE(cc[100], "vector<bool>::operator[] index out of bounds");
+  }
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      c.erase(c.end()), "vector<bool>::erase(iterator) called with a non-dereferenceable iterator");
+  TEST_LIBCPP_ASSERT_FAILURE(
+      c.erase(c.begin() + 1, c.begin()), "vector<bool>::erase(iterator, iterator) called with an invalid range");
+}
+
+int main(int, char**) {
+  test<std::allocator<bool>>();
+  test<min_allocator<bool>>();
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/diagnostics/system_error_win_codes.pass.cpp b/libcxx/test/libcxx/diagnostics/system_error_win_codes.pass.cpp
new file mode 100644
index 000000000000..799a5b5c0b08
--- /dev/null
+++ b/libcxx/test/libcxx/diagnostics/system_error_win_codes.pass.cpp
@@ -0,0 +1,25 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: windows
+
+// Validate that system_error on windows accepts Windows' System Error Codes (as
+// used by win32 APIs and reported by GetLastError), and that they are properly
+// translated to generic conditions.
+
+#include <windows.h>
+#include <system_error>
+#include <cassert>
+
+#include "test_macros.h"
+
+int main(int, char**) {
+  LIBCPP_ASSERT(std::error_code(ERROR_ACCESS_DENIED, std::system_category()) == std::errc::permission_denied);
+  LIBCPP_ASSERT(std::error_code(ERROR_PATH_NOT_FOUND, std::system_category()) == std::errc::no_such_file_or_directory);
+  return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp b/libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp
index cd998d46b7e8..c2afa6b8dfd0 100644
--- a/libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp
@@ -17,24 +17,47 @@
 #include <type_traits>
 
 #include "atomic_helpers.h"
+#include "test_helper.h"
 #include "test_macros.h"
 
 template <typename T>
 struct TestExchange {
   void operator()() const {
-    T x(T(1));
-    std::atomic_ref<T> const a(x);
+    {
+      T x(T(1));
+      std::atomic_ref<T> const a(x);
+
+      {
+        std::same_as<T> decltype(auto) y = a.exchange(T(2));
+        assert(y == T(1));
+        ASSERT_NOEXCEPT(a.exchange(T(2)));
+      }
+
+      {
+        std::same_as<T> decltype(auto) y = a.exchange(T(3), std::memory_order_seq_cst);
+        assert(y == T(2));
+        ASSERT_NOEXCEPT(a.exchange(T(3), std::memory_order_seq_cst));
+      }
+    }
 
+    // memory_order::release
     {
-      std::same_as<T> decltype(auto) y = a.exchange(T(2));
-      assert(y == T(1));
-      ASSERT_NOEXCEPT(a.exchange(T(2)));
+      auto exchange = [](std::atomic_ref<T> const& x, T, T new_val) {
+        x.exchange(new_val, std::memory_order::release);
+      };
+      auto load = [](std::atomic_ref<T> const& x) { return x.load(std::memory_order::acquire); };
+      test_acquire_release<T>(exchange, load);
     }
 
+    // memory_order::seq_cst
     {
-      std::same_as<T> decltype(auto) y = a.exchange(T(3), std::memory_order_seq_cst);
-      assert(y == T(2));
-      ASSERT_NOEXCEPT(a.exchange(T(3), std::memory_order_seq_cst));
+      auto exchange_no_arg     = [](std::atomic_ref<T> const& x, T, T new_val) { x.exchange(new_val); };
+      auto exchange_with_order = [](std::atomic_ref<T> const& x, T, T new_val) {
+        x.exchange(new_val, std::memory_order::seq_cst);
+      };
+      auto load = [](std::atomic_ref<T> const& x) { return x.load(); };
+      test_seq_cst<T>(exchange_no_arg, load);
+      test_seq_cst<T>(exchange_with_order, load);
     }
   }
 };
diff --git a/libcxx/test/std/containers/sequences/vector.bool/at.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/at.pass.cpp
new file mode 100644
index 000000000000..16832dd831e6
--- /dev/null
+++ b/libcxx/test/std/containers/sequences/vector.bool/at.pass.cpp
@@ -0,0 +1,125 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <vector>
+
+// reference at(size_type n); // constexpr since C++20
+
+#include <cassert>
+#include <memory>
+#include <vector>
+
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_macros.h"
+
+#ifndef TEST_HAS_NO_EXCEPTIONS
+#  include <stdexcept>
+#endif
+
+template <typename Allocator>
+TEST_CONSTEXPR_CXX20 void test() {
+  using C         = std::vector<bool, Allocator>;
+  using reference = typename C::reference;
+  bool a[]        = {1, 0, 1, 0, 1};
+  C v(a, a + sizeof(a) / sizeof(a[0]));
+  ASSERT_SAME_TYPE(reference, decltype(v.at(0)));
+  assert(v.at(0) == true);
+  assert(v.at(1) == false);
+  assert(v.at(2) == true);
+  assert(v.at(3) == false);
+  assert(v.at(4) == true);
+  v.at(1) = 1;
+  assert(v.at(1) == true);
+  v.at(3) = 1;
+  assert(v.at(3) == true);
+}
+
+template <typename Allocator>
+void test_exception() {
+#ifndef TEST_HAS_NO_EXCEPTIONS
+  {
+    bool a[] = {1, 0, 1, 1};
+    using C  = std::vector<bool, Allocator>;
+    C v(a, a + sizeof(a) / sizeof(a[0]));
+
+    try {
+      TEST_IGNORE_NODISCARD v.at(4);
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+
+    try {
+      TEST_IGNORE_NODISCARD v.at(5);
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+
+    try {
+      TEST_IGNORE_NODISCARD v.at(6);
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+
+    try {
+      using size_type = typename C::size_type;
+      TEST_IGNORE_NODISCARD v.at(static_cast<size_type>(-1));
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+  }
+
+  {
+    std::vector<bool, Allocator> v;
+    try {
+      TEST_IGNORE_NODISCARD v.at(0);
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+  }
+#endif
+}
+
+TEST_CONSTEXPR_CXX20 bool tests() {
+  test<std::allocator<bool> >();
+  test<min_allocator<bool> >();
+  test<test_allocator<bool> >();
+  return true;
+}
+
+void test_exceptions() {
+  test_exception<std::allocator<bool> >();
+  test_exception<min_allocator<bool> >();
+  test_exception<test_allocator<bool> >();
+}
+
+int main(int, char**) {
+  tests();
+  test_exceptions();
+
+#if TEST_STD_VER >= 20
+  static_assert(tests());
+#endif
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/sequences/vector.bool/at_const.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/at_const.pass.cpp
new file mode 100644
index 000000000000..5ed794d13f19
--- /dev/null
+++ b/libcxx/test/std/containers/sequences/vector.bool/at_const.pass.cpp
@@ -0,0 +1,121 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <vector>
+
+// const_reference at(size_type n) const; // constexpr since C++20
+
+#include <cassert>
+#include <memory>
+#include <vector>
+
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_macros.h"
+
+#ifndef TEST_HAS_NO_EXCEPTIONS
+#  include <stdexcept>
+#endif
+
+template <typename Allocator>
+TEST_CONSTEXPR_CXX20 void test() {
+  using C               = const std::vector<bool, Allocator>;
+  using const_reference = typename C::const_reference;
+  bool a[]              = {1, 0, 1, 0, 1};
+  C v(a, a + sizeof(a) / sizeof(a[0]));
+  ASSERT_SAME_TYPE(const_reference, decltype(v.at(0)));
+  assert(v.at(0) == true);
+  assert(v.at(1) == false);
+  assert(v.at(2) == true);
+  assert(v.at(3) == false);
+  assert(v.at(4) == true);
+}
+
+template <typename Allocator>
+void test_exception() {
+#ifndef TEST_HAS_NO_EXCEPTIONS
+  {
+    bool a[] = {1, 0, 1, 1};
+    using C  = const std::vector<bool, Allocator>;
+    C v(a, a + sizeof(a) / sizeof(a[0]));
+
+    try {
+      TEST_IGNORE_NODISCARD v.at(4);
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+
+    try {
+      TEST_IGNORE_NODISCARD v.at(5);
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+
+    try {
+      TEST_IGNORE_NODISCARD v.at(6);
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+
+    try {
+      using size_type = typename C::size_type;
+      TEST_IGNORE_NODISCARD v.at(static_cast<size_type>(-1));
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+  }
+
+  {
+    std::vector<bool, Allocator> v;
+    try {
+      TEST_IGNORE_NODISCARD v.at(0);
+      assert(false);
+    } catch (std::out_of_range const&) {
+      // pass
+    } catch (...) {
+      assert(false);
+    }
+  }
+#endif
+}
+
+TEST_CONSTEXPR_CXX20 bool tests() {
+  test<std::allocator<bool> >();
+  test<min_allocator<bool> >();
+  test<test_allocator<bool> >();
+  return true;
+}
+
+void test_exceptions() {
+  test_exception<std::allocator<bool> >();
+  test_exception<min_allocator<bool> >();
+  test_exception<test_allocator<bool> >();
+}
+
+int main(int, char**) {
+  tests();
+  test_exceptions();
+
+#if TEST_STD_VER >= 20
+  static_assert(tests());
+#endif
+
+  return 0;
+}
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.compare/eq_error_code_error_code.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.compare/eq_error_code_error_code.pass.cpp
index f1f49733280b..a8b565bb0ab9 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.compare/eq_error_code_error_code.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.compare/eq_error_code_error_code.pass.cpp
@@ -22,6 +22,10 @@
 
 #include "test_macros.h"
 
+#ifndef _WIN32
+#  define TEST_SYSTEM_CATEGORY_IS_GENERIC_CATEGORY
+#endif
+
 int main(int, char**) {
   std::error_code e_code1(5, std::generic_category());
   std::error_code e_code2(5, std::system_category());
@@ -45,7 +49,9 @@ int main(int, char**) {
   assert(e_code2 == e_code2);
   assert(e_code2 != e_code3);
   assert(e_code2 != e_code4);
+#ifdef TEST_SYSTEM_CATEGORY_IS_GENERIC_CATEGORY
   LIBCPP_ASSERT(e_code2 == e_condition1);
+#endif
   assert(e_code2 == e_condition2);
   LIBCPP_ASSERT(e_code2 != e_condition3);
   assert(e_code2 != e_condition4);
@@ -65,11 +71,15 @@ int main(int, char**) {
   assert(e_code4 == e_code4);
   LIBCPP_ASSERT(e_code4 != e_condition1);
   assert(e_code4 != e_condition2);
+#ifdef TEST_SYSTEM_CATEGORY_IS_GENERIC_CATEGORY
   LIBCPP_ASSERT(e_code4 == e_condition3);
+#endif
   assert(e_code4 == e_condition4);
 
   assert(e_condition1 == e_code1);
+#ifdef TEST_SYSTEM_CATEGORY_IS_GENERIC_CATEGORY
   LIBCPP_ASSERT(e_condition1 == e_code2);
+#endif
   assert(e_condition1 != e_code3);
   LIBCPP_ASSERT(e_condition1 != e_code4);
   assert(e_condition1 == e_condition1);
@@ -89,7 +99,9 @@ int main(int, char**) {
   assert(e_condition3 != e_code1);
   LIBCPP_ASSERT(e_condition3 != e_code2);
   assert(e_condition3 == e_code3);
+#ifdef TEST_SYSTEM_CATEGORY_IS_GENERIC_CATEGORY
   LIBCPP_ASSERT(e_condition3 == e_code4);
+#endif
   assert(e_condition3 != e_condition1);
   assert(e_condition3 != e_condition2);
   assert(e_condition3 == e_condition3);
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.derived/message.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.derived/message.pass.cpp
index 9f7eb42bc78d..f7f43132902f 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.derived/message.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.derived/message.pass.cpp
@@ -29,8 +29,11 @@ int main(int, char**) {
   assert(!m1.empty());
   assert(!m2.empty());
   assert(!m3.empty());
+#ifndef _WIN32
+  // On windows, system_category is distinct.
   LIBCPP_ASSERT(m1 == m2);
-  assert(m1 != m3);
+#endif
+  assert(m2 != m3);
 
   return 0;
 }
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp
index 6ba33ba44ca4..255cbe75e2fa 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp
@@ -33,7 +33,12 @@ int main(int, char**) {
   {
     const std::error_category& e_cat1 = std::system_category();
     std::error_condition e_cond       = e_cat1.default_error_condition(5);
+#ifdef _WIN32
+    // Windows' system error 5 is ERROR_ACCESS_DENIED, which maps to generic code permission_denied.
+    LIBCPP_ASSERT(e_cond.value() == static_cast<int>(std::errc::permission_denied));
+#else
     LIBCPP_ASSERT(e_cond.value() == 5);
+#endif
     LIBCPP_ASSERT(e_cond.category() == std::generic_category());
     assert(e_cat1.equivalent(5, e_cond));
 
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp
index 303a95a0128b..071ee7f6c891 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp
@@ -172,8 +172,13 @@ static void test_with_ec_dne() {
     file_status st = status(p, status_ec);
     file_status sym_st = symlink_status(p, sym_status_ec);
     std::error_code ec = GetTestEC(2);
-    auto CheckEC = [&](std::error_code const& other_ec) {
-      bool res = ec == other_ec;
+    auto CheckEC                  = [&](std::error_code const& other_ec) {
+      // Note: we're comparing equality of the _canonicalized_ error_condition
+      // here (unlike in other tests where we expect exactly the same
+      // error_code). This is because directory_entry can construct its own
+      // generic_category error when a file doesn't exist, instead of passing
+      // through an underlying system_category error.
+      bool res = ec.default_error_condition() == other_ec.default_error_condition();
       ec = GetTestEC(2);
       return res;
     };
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp
index dd72232ee530..dec04df7ca01 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp
@@ -44,7 +44,7 @@ static void test_basic() {
     file_status es = e.status(eec);
     assert(ps.type() == es.type());
     assert(ps.permissions() == es.permissions());
-    assert(pec == eec);
+    assert(pec.default_error_condition() == eec.default_error_condition());
   }
   for (const auto& p : TestCases) {
     const directory_entry e(p);
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp
index 24e806950952..77da936382aa 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp
@@ -44,7 +44,7 @@ static void test_signature() {
     file_status es = e.symlink_status(eec);
     assert(ps.type() == es.type());
     assert(ps.permissions() == es.permissions());
-    assert(pec == eec);
+    assert(pec.default_error_condition() == eec.default_error_condition());
   }
   for (const auto& p : TestCases) {
     const directory_entry e(p);
diff --git a/libcxx/test/support/filesystem_test_helper.h b/libcxx/test/support/filesystem_test_helper.h
index a63d645d1a01..2ad9efb32c60 100644
--- a/libcxx/test/support/filesystem_test_helper.h
+++ b/libcxx/test/support/filesystem_test_helper.h
@@ -583,7 +583,11 @@ struct ExceptionChecker {
     assert(ErrorIsImp(Err.code(), {expected_err}));
     assert(Err.path1() == expected_path1);
     assert(Err.path2() == expected_path2);
+#ifndef _WIN32
+    // On Windows, the error strings are windows error code strings, and don't
+    // match textually with the strings generated for generic std::errc::*.
     LIBCPP_ONLY(check_libcxx_string(Err));
+#endif
   }
 
   void check_libcxx_string(fs::filesystem_error const& Err) {
diff --git a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
index 05c44e49b448..0f8f0e8864d0 100644
--- a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
+++ b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
@@ -92,6 +92,7 @@ set(SOURCES
     header_exportable_declarations.cpp
     hide_from_abi.cpp
     internal_ftm_use.cpp
+    nodebug_on_aliases.cpp
     proper_version_checks.cpp
     qualify_declval.cpp
     robust_against_adl.cpp
diff --git a/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp b/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp
index 54beed5e30be..bc7c8ce7ec44 100644
--- a/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp
+++ b/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp
@@ -13,6 +13,7 @@
 #include "header_exportable_declarations.hpp"
 #include "hide_from_abi.hpp"
 #include "internal_ftm_use.hpp"
+#include "nodebug_on_aliases.hpp"
 #include "proper_version_checks.hpp"
 #include "qualify_declval.hpp"
 #include "robust_against_adl.hpp"
@@ -26,6 +27,7 @@ public:
     check_factories.registerCheck<libcpp::header_exportable_declarations>("libcpp-header-exportable-declarations");
     check_factories.registerCheck<libcpp::hide_from_abi>("libcpp-hide-from-abi");
     check_factories.registerCheck<libcpp::internal_ftm_use>("libcpp-internal-ftms");
+    check_factories.registerCheck<libcpp::nodebug_on_aliases>("libcpp-nodebug-on-aliases");
     check_factories.registerCheck<libcpp::proper_version_checks>("libcpp-cpp-version-check");
     check_factories.registerCheck<libcpp::robust_against_adl_check>("libcpp-robust-against-adl");
     check_factories.registerCheck<libcpp::uglify_attributes>("libcpp-uglify-attributes");
diff --git a/libcxx/test/tools/clang_tidy_checks/nodebug_on_aliases.cpp b/libcxx/test/tools/clang_tidy_checks/nodebug_on_aliases.cpp
new file mode 100644
index 000000000000..9b96269e5980
--- /dev/null
+++ b/libcxx/test/tools/clang_tidy_checks/nodebug_on_aliases.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-tidy/ClangTidyCheck.h"
+
+#include "nodebug_on_aliases.hpp"
+#include "utilities.hpp"
+
+namespace libcpp {
+namespace {
+AST_MATCHER(clang::NamedDecl, isPretty) { return !is_ugly_name(Node.getName()); }
+} // namespace
+
+nodebug_on_aliases::nodebug_on_aliases(llvm::StringRef name, clang::tidy::ClangTidyContext* context)
+    : clang::tidy::ClangTidyCheck(name, context) {}
+
+void nodebug_on_aliases::registerMatchers(clang::ast_matchers::MatchFinder* finder) {
+  using namespace clang::ast_matchers;
+  finder->addMatcher(
+      typeAliasDecl(unless(anyOf(isPretty(), hasAttr(clang::attr::NoDebug), hasAncestor(functionDecl()))))
+          .bind("nodebug_on_internal_aliases"),
+      this);
+}
+
+void nodebug_on_aliases::check(const clang::ast_matchers::MatchFinder::MatchResult& result) {
+  if (const auto* alias = result.Nodes.getNodeAs<clang::TypeAliasDecl>("nodebug_on_internal_aliases")) {
+    diag(alias->getBeginLoc(), "Internal aliases should always be marked _LIBCPP_NODEBUG");
+  }
+}
+} // namespace libcpp
diff --git a/libcxx/test/tools/clang_tidy_checks/nodebug_on_aliases.hpp b/libcxx/test/tools/clang_tidy_checks/nodebug_on_aliases.hpp
new file mode 100644
index 000000000000..1097e8910649
--- /dev/null
+++ b/libcxx/test/tools/clang_tidy_checks/nodebug_on_aliases.hpp
@@ -0,0 +1,18 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-tidy/ClangTidyCheck.h"
+
+namespace libcpp {
+class nodebug_on_aliases : public clang::tidy::ClangTidyCheck {
+public:
+  nodebug_on_aliases(llvm::StringRef, clang::tidy::ClangTidyContext*);
+  void registerMatchers(clang::ast_matchers::MatchFinder*) override;
+  void check(const clang::ast_matchers::MatchFinder::MatchResult&) override;
+};
+} // namespace libcpp
diff --git a/libcxx/test/tools/clang_tidy_checks/uglify_attributes.cpp b/libcxx/test/tools/clang_tidy_checks/uglify_attributes.cpp
index 7812b236f613..24bacde6304b 100644
--- a/libcxx/test/tools/clang_tidy_checks/uglify_attributes.cpp
+++ b/libcxx/test/tools/clang_tidy_checks/uglify_attributes.cpp
@@ -10,20 +10,11 @@
 #include "clang-tidy/ClangTidyModuleRegistry.h"
 
 #include "uglify_attributes.hpp"
+#include "utilities.hpp"
 
-#include <algorithm>
-#include <array>
-#include <span>
-#include <string_view>
+#include <optional>
 
 namespace {
-bool isUgly(std::string_view str) {
-  if (str.size() < 2)
-    return false;
-  if (str[0] == '_' && str[1] >= 'A' && str[1] <= 'Z')
-    return true;
-  return str.find("__") != std::string_view::npos;
-}
 
 // Starting with Clang 17 ToT C++23 support is provided by CPlusPlus23 instead
 // of C++23 support is provided by CPlusPlus2b. To allow a smooth transition for
@@ -77,17 +68,15 @@ AST_MATCHER(clang::Attr, isPretty) {
   if (Node.isKeywordAttribute() || !Node.getAttrName())
     return false;
   if (Node.isCXX11Attribute() && !Node.hasScope()) {
-    if (isUgly(Node.getAttrName()->getName()))
+    if (is_ugly_name(Node.getAttrName()->getName()))
       return false;
     return !llvm::is_contained(
         get_standard_attributes(Finder->getASTContext().getLangOpts()), Node.getAttrName()->getName());
   }
   if (Node.hasScope())
-    if (!isUgly(Node.getScopeName()->getName()))
+    if (!is_ugly_name(Node.getScopeName()->getName()))
       return true;
-  return !isUgly(Node.getAttrName()->getName());
-
-  return false;
+  return !is_ugly_name(Node.getAttrName()->getName());
 }
 
 std::optional<std::string> getUglyfiedCXX11Attr(const clang::Attr& attr) {
diff --git a/libcxx/test/tools/clang_tidy_checks/utilities.hpp b/libcxx/test/tools/clang_tidy_checks/utilities.hpp
new file mode 100644
index 000000000000..b780efef2385
--- /dev/null
+++ b/libcxx/test/tools/clang_tidy_checks/utilities.hpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIBCXX_TEST_TOOLS_CLANG_TIDY_CHECKS_UTILITIES_HPP
+#define LIBCXX_TEST_TOOLS_CLANG_TIDY_CHECKS_UTILITIES_HPP
+
+#include <string_view>
+
+inline bool is_ugly_name(std::string_view str) {
+  if (str.size() < 2)
+    return false;
+  if (str[0] == '_' && str[1] >= 'A' && str[1] <= 'Z')
+    return true;
+  return str.find("__") != std::string_view::npos;
+}
+
+#endif // LIBCXX_TEST_TOOLS_CLANG_TIDY_CHECKS_UTILITIES_HPP
diff --git a/libcxx/utils/libcxx-benchmark-json b/libcxx/utils/libcxx-benchmark-json
new file mode 100755
index 000000000000..7f743c32caf4
--- /dev/null
+++ b/libcxx/utils/libcxx-benchmark-json
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+
+set -e
+
+PROGNAME="$(basename "${0}")"
+MONOREPO_ROOT="$(realpath $(dirname "${PROGNAME}"))"
+function usage() {
+cat <<EOF
+Usage:
+${PROGNAME} [-h|--help] <build-directory> benchmarks...
+
+Print the path to the JSON files containing benchmark results for the given benchmarks.
+
+This requires those benchmarks to have already been run, i.e. this only resolves the path
+to the benchmark .json file within the build directory.
+
+<build-directory>  The path to the build directory.
+benchmarks...      Paths of the benchmarks to extract the results for. Those paths are relative to '<monorepo-root>'.
+
+Example
+=======
+$ cmake -S runtimes -B build/ -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi"
+$ libcxx-lit build/ -sv libcxx/test/benchmarks/algorithms/for_each.bench.cpp
+$ less \$(${PROGNAME} build/ libcxx/test/benchmarks/algorithms/for_each.bench.cpp)
+EOF
+}
+
+if [[ "${1}" == "-h" || "${1}" == "--help" ]]; then
+    usage
+    exit 0
+fi
+
+if [[ $# -lt 1 ]]; then
+    usage
+    exit 1
+fi
+
+build_dir="${1}"
+shift
+
+for benchmark in ${@}; do
+    # Normalize the paths by turning all benchmarks paths into absolute ones and then making them
+    # relative to the root of the monorepo.
+    benchmark="$(realpath ${benchmark})"
+    relative=$(python -c "import os; import sys; print(os.path.relpath(sys.argv[1], sys.argv[2]))" "${benchmark}" "${MONOREPO_ROOT}")
+
+    # Extract components of the benchmark path
+    directory="$(dirname ${relative})"
+    file="$(basename ${relative})"
+
+    # Reconstruct the (slightly weird) path to the benchmark json file. This should be kept in sync
+    # whenever the test suite changes.
+    json="${build_dir}/${directory}/Output/${file}.dir/benchmark-result.json"
+    if [[ -f "${json}" ]]; then
+        echo "${json}"
+    fi
+done
diff --git a/libcxx/utils/libcxx-compare-benchmarks b/libcxx/utils/libcxx-compare-benchmarks
new file mode 100755
index 000000000000..e04820fc57ed
--- /dev/null
+++ b/libcxx/utils/libcxx-compare-benchmarks
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+
+set -e
+
+PROGNAME="$(basename "${0}")"
+MONOREPO_ROOT="$(realpath $(dirname "${PROGNAME}"))"
+function usage() {
+cat <<EOF
+Usage:
+${PROGNAME} [-h|--help] <baseline-build> <candidate-build> benchmarks...
+
+Compare the given benchmarks between the baseline and the candidate build directories.
+
+This requires those benchmarks to have already been generated in both build directories.
+
+<baseline-build>   The path to the build directory considered the baseline.
+<candidate-build>  The path to the build directory considered the candidate.
+benchmarks...      Paths of the benchmarks to compare. Those paths are relative to '<monorepo-root>'.
+
+Example
+=======
+$ libcxx-lit build1/ -sv libcxx/test/benchmarks/algorithms/for_each.bench.cpp
+$ libcxx-lit build2/ -sv libcxx/test/benchmarks/algorithms/for_each.bench.cpp
+$ ${PROGNAME} build1/ build2/ libcxx/test/benchmarks/algorithms/for_each.bench.cpp
+EOF
+}
+
+if [[ "${1}" == "-h" || "${1}" == "--help" ]]; then
+    usage
+    exit 0
+fi
+
+if [[ $# -lt 1 ]]; then
+    usage
+    exit 1
+fi
+
+baseline="${1}"
+candidate="${2}"
+shift; shift
+
+GBENCH="${MONOREPO_ROOT}/third-party/benchmark"
+
+python3 -m venv /tmp/libcxx-compare-benchmarks-venv
+source /tmp/libcxx-compare-benchmarks-venv/bin/activate
+pip3 install -r ${GBENCH}/tools/requirements.txt
+
+for benchmark in ${@}; do
+    base="$(${MONOREPO_ROOT}/libcxx/utils/libcxx-benchmark-json ${baseline} ${benchmark})"
+    cand="$(${MONOREPO_ROOT}/libcxx/utils/libcxx-benchmark-json ${candidate} ${benchmark})"
+
+    if [[ ! -e "${base}" ]]; then
+        echo "Benchmark ${benchmark} does not exist in the baseline"
+        continue
+    fi
+    if [[ ! -e "${cand}" ]]; then
+        echo "Benchmark ${benchmark} does not exist in the candidate"
+        continue
+    fi
+
+    "${GBENCH}/tools/compare.py" benchmarks "${base}" "${cand}"
+done
diff --git a/libcxxabi/src/stdlib_new_delete.cpp b/libcxxabi/src/stdlib_new_delete.cpp
index f386b28f0cfe..73798e211c31 100644
--- a/libcxxabi/src/stdlib_new_delete.cpp
+++ b/libcxxabi/src/stdlib_new_delete.cpp
@@ -63,7 +63,7 @@ static void* operator_new_impl(std::size_t size) {
   return p;
 }
 
-_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void* operator new(std::size_t size) _THROW_BAD_ALLOC {
+_LIBCPP_OVERRIDABLE_FUNCTION(_Znwm, void*, operator new, (std::size_t size)) _THROW_BAD_ALLOC {
   void* p = operator_new_impl(size);
   if (p == nullptr)
     __throw_bad_alloc_shim();
@@ -74,7 +74,7 @@ _LIBCPP_WEAK void* operator new(size_t size, const std::nothrow_t&) noexcept {
 #if !_LIBCPP_HAS_EXCEPTIONS
 #  if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
   _LIBCPP_ASSERT_SHIM(
-      !std::__is_function_overridden(static_cast<void* (*)(std::size_t)>(&operator new)),
+      !std::__is_function_overridden<static_cast<void* (*)(std::size_t)>(&operator new)>(),
       "libc++ was configured with exceptions disabled and `operator new(size_t)` has been overridden, "
       "but `operator new(size_t, nothrow_t)` has not been overridden. This is problematic because "
       "`operator new(size_t, nothrow_t)` must call `operator new(size_t)`, which will terminate in case "
@@ -94,7 +94,7 @@ _LIBCPP_WEAK void* operator new(size_t size, const std::nothrow_t&) noexcept {
 #endif
 }
 
-_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void* operator new[](size_t size) _THROW_BAD_ALLOC {
+_LIBCPP_OVERRIDABLE_FUNCTION(_Znam, void*, operator new[], (size_t size)) _THROW_BAD_ALLOC {
   return ::operator new(size);
 }
 
@@ -102,7 +102,7 @@ _LIBCPP_WEAK void* operator new[](size_t size, const std::nothrow_t&) noexcept {
 #if !_LIBCPP_HAS_EXCEPTIONS
 #  if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
   _LIBCPP_ASSERT_SHIM(
-      !std::__is_function_overridden(static_cast<void* (*)(std::size_t)>(&operator new[])),
+      !std::__is_function_overridden<static_cast<void* (*)(std::size_t)>(&operator new[])>(),
       "libc++ was configured with exceptions disabled and `operator new[](size_t)` has been overridden, "
       "but `operator new[](size_t, nothrow_t)` has not been overridden. This is problematic because "
       "`operator new[](size_t, nothrow_t)` must call `operator new[](size_t)`, which will terminate in case "
@@ -156,8 +156,8 @@ static void* operator_new_aligned_impl(std::size_t size, std::align_val_t alignm
   return p;
 }
 
-_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void*
-operator new(std::size_t size, std::align_val_t alignment) _THROW_BAD_ALLOC {
+_LIBCPP_OVERRIDABLE_FUNCTION(_ZnwmSt11align_val_t, void*, operator new, (std::size_t size, std::align_val_t alignment))
+_THROW_BAD_ALLOC {
   void* p = operator_new_aligned_impl(size, alignment);
   if (p == nullptr)
     __throw_bad_alloc_shim();
@@ -168,7 +168,7 @@ _LIBCPP_WEAK void* operator new(size_t size, std::align_val_t alignment, const s
 #  if !_LIBCPP_HAS_EXCEPTIONS
 #    if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
   _LIBCPP_ASSERT_SHIM(
-      !std::__is_function_overridden(static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new)),
+      !std::__is_function_overridden<static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new)>(),
       "libc++ was configured with exceptions disabled and `operator new(size_t, align_val_t)` has been overridden, "
       "but `operator new(size_t, align_val_t, nothrow_t)` has not been overridden. This is problematic because "
       "`operator new(size_t, align_val_t, nothrow_t)` must call `operator new(size_t, align_val_t)`, which will "
@@ -188,16 +188,14 @@ _LIBCPP_WEAK void* operator new(size_t size, std::align_val_t alignment, const s
 #  endif
 }
 
-_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void*
-operator new[](size_t size, std::align_val_t alignment) _THROW_BAD_ALLOC {
-  return ::operator new(size, alignment);
-}
+_LIBCPP_OVERRIDABLE_FUNCTION(_ZnamSt11align_val_t, void*, operator new[], (size_t size, std::align_val_t alignment))
+_THROW_BAD_ALLOC { return ::operator new(size, alignment); }
 
 _LIBCPP_WEAK void* operator new[](size_t size, std::align_val_t alignment, const std::nothrow_t&) noexcept {
 #  if !_LIBCPP_HAS_EXCEPTIONS
 #    if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
   _LIBCPP_ASSERT_SHIM(
-      !std::__is_function_overridden(static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new[])),
+      !std::__is_function_overridden<static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new[])>(),
       "libc++ was configured with exceptions disabled and `operator new[](size_t, align_val_t)` has been overridden, "
       "but `operator new[](size_t, align_val_t, nothrow_t)` has not been overridden. This is problematic because "
       "`operator new[](size_t, align_val_t, nothrow_t)` must call `operator new[](size_t, align_val_t)`, which will "
diff --git a/lld/COFF/Driver.h b/lld/COFF/Driver.h
index b04a00e2d1cd..513256890429 100644
--- a/lld/COFF/Driver.h
+++ b/lld/COFF/Driver.h
@@ -112,7 +112,7 @@ private:
   void detectWinSysRoot(const llvm::opt::InputArgList &args);
 
   // Adds various search paths based on the sysroot.  Must only be called once
-  // config->machine has been set.
+  // config.machine has been set.
   void addWinSysRootLibSearchPaths();
 
   // Symbol names are mangled by prepending "_" on x86.
diff --git a/lld/ELF/SymbolTable.cpp b/lld/ELF/SymbolTable.cpp
index d5b2200b5022..975700505fac 100644
--- a/lld/ELF/SymbolTable.cpp
+++ b/lld/ELF/SymbolTable.cpp
@@ -318,7 +318,7 @@ void SymbolTable::scanVersionScript() {
     // script with `global: *` are used.
     //
     // '--retain-symbol-file' adds a "*" pattern to
-    // 'config->versionDefinitions[VER_NDX_LOCAL].nonLocalPatterns', see
+    // 'versionDefinitions[VER_NDX_LOCAL].nonLocalPatterns', see
     // 'readConfigs()' in 'Driver.cpp'. Note that it is not '.localPatterns',
     // and may seem counterintuitive, but still works as expected. Here we can
     // exploit that and skip analyzing the pattern added for this option.
diff --git a/lld/MachO/ConcatOutputSection.h b/lld/MachO/ConcatOutputSection.h
index 9af661d0ab1e..8131c48d3111 100644
--- a/lld/MachO/ConcatOutputSection.h
+++ b/lld/MachO/ConcatOutputSection.h
@@ -25,8 +25,9 @@ class Defined;
 // in the final binary.
 class ConcatOutputSection : public OutputSection {
 public:
-  explicit ConcatOutputSection(StringRef name)
-      : OutputSection(ConcatKind, name) {}
+  explicit ConcatOutputSection(StringRef name,
+                               OutputSection::Kind kind = ConcatKind)
+      : OutputSection(kind, name) {}
 
   const ConcatInputSection *firstSection() const { return inputs.front(); }
   const ConcatInputSection *lastSection() const { return inputs.back(); }
@@ -46,7 +47,7 @@ public:
   void writeTo(uint8_t *buf) const override;
 
   static bool classof(const OutputSection *sec) {
-    return sec->kind() == ConcatKind;
+    return sec->kind() == ConcatKind || sec->kind() == TextKind;
   }
 
   static ConcatOutputSection *getOrCreateForInput(const InputSection *);
@@ -66,12 +67,18 @@ private:
 // support thunk insertion.
 class TextOutputSection : public ConcatOutputSection {
 public:
-  explicit TextOutputSection(StringRef name) : ConcatOutputSection(name) {}
+  explicit TextOutputSection(StringRef name)
+      : ConcatOutputSection(name, TextKind) {}
   void finalizeContents() override {}
   void finalize() override;
   bool needsThunks() const;
+  ArrayRef<ConcatInputSection *> getThunks() const { return thunks; }
   void writeTo(uint8_t *buf) const override;
 
+  static bool classof(const OutputSection *sec) {
+    return sec->kind() == TextKind;
+  }
+
 private:
   uint64_t estimateStubsInRangeVA(size_t callIdx) const;
 
diff --git a/lld/MachO/MapFile.cpp b/lld/MachO/MapFile.cpp
index 9c0621622ae2..12417df8cecb 100644
--- a/lld/MachO/MapFile.cpp
+++ b/lld/MachO/MapFile.cpp
@@ -161,6 +161,20 @@ static uint64_t getSymSizeForMap(Defined *sym) {
   return sym->size;
 }
 
+// Merges two vectors of input sections in order of their outSecOff values.
+// This approach creates a new (temporary) vector which is not ideal but the
+// ideal approach leads to a lot of code duplication.
+static std::vector<ConcatInputSection *>
+mergeOrderedInputs(ArrayRef<ConcatInputSection *> inputs1,
+                   ArrayRef<ConcatInputSection *> inputs2) {
+  std::vector<ConcatInputSection *> vec(inputs1.size() + inputs2.size());
+  std::merge(inputs1.begin(), inputs1.end(), inputs2.begin(), inputs2.end(),
+             vec.begin(), [](ConcatInputSection *a, ConcatInputSection *b) {
+               return a->outSecOff < b->outSecOff;
+             });
+  return vec;
+}
+
 void macho::writeMapFile() {
   if (config->mapFile.empty())
     return;
@@ -220,7 +234,11 @@ void macho::writeMapFile() {
   os << "# Address\tSize    \tFile  Name\n";
   for (const OutputSegment *seg : outputSegments) {
     for (const OutputSection *osec : seg->getSections()) {
-      if (auto *concatOsec = dyn_cast<ConcatOutputSection>(osec)) {
+      if (auto *textOsec = dyn_cast<TextOutputSection>(osec)) {
+        auto inputsAndThunks =
+            mergeOrderedInputs(textOsec->inputs, textOsec->getThunks());
+        printIsecArrSyms(inputsAndThunks);
+      } else if (auto *concatOsec = dyn_cast<ConcatOutputSection>(osec)) {
         printIsecArrSyms(concatOsec->inputs);
       } else if (osec == in.cStringSection || osec == in.objcMethnameSection) {
         const auto &liveCStrings = info.liveCStringsForSection.lookup(osec);
diff --git a/lld/MachO/OutputSection.h b/lld/MachO/OutputSection.h
index 5297a03c2cfa..9afd3a9eeb19 100644
--- a/lld/MachO/OutputSection.h
+++ b/lld/MachO/OutputSection.h
@@ -37,6 +37,7 @@ public:
   enum Kind {
     ConcatKind,
     SyntheticKind,
+    TextKind,
   };
 
   OutputSection(Kind kind, StringRef name) : name(name), sectionKind(kind) {}
diff --git a/lld/test/MachO/arm64-thunks.s b/lld/test/MachO/arm64-thunks.s
index d887359bbc23..858a27dfe36a 100644
--- a/lld/test/MachO/arm64-thunks.s
+++ b/lld/test/MachO/arm64-thunks.s
@@ -8,14 +8,46 @@
 ## (4) early calls to a dylib stub use a thunk, and later calls the stub
 ##     directly
 ## (5) Thunks are created for all sections in the text segment with branches.
+## (6) Thunks are in the linker map file.
 ## Notes:
 ## 0x4000000 = 64 Mi = half the magnitude of the forward-branch range
 
 # RUN: rm -rf %t; mkdir %t
 # RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %s -o %t/input.o
-# RUN: %lld -arch arm64 -dead_strip -lSystem -U _extern_sym -o %t/thunk %t/input.o
+# RUN: %lld -arch arm64 -dead_strip -lSystem -U _extern_sym -map %t/thunk.map -o %t/thunk %t/input.o
 # RUN: llvm-objdump --no-print-imm-hex -d --no-show-raw-insn %t/thunk | FileCheck %s
 
+## Check that the thunks appear in the map file and that everything is sorted by address
+# Because of the `.space` instructions, there will end up being a lot of dead symbols in the 
+# linker map (linker map will be ~2.7GB). So to avoid the test trying to (slowly) match regex
+# across all the ~2.7GB of the linker map - generate a version of the linker map without dead symbols.
+# RUN: awk '/# Dead Stripped Symbols:/ {exit} {print}' %t/thunk.map > %t/thunk_no_dead_syms.map
+
+# RUN: FileCheck %s --input-file %t/thunk_no_dead_syms.map --check-prefix=MAP
+ 
+# MAP:      0x{{[[:xdigit:]]+}} {{.*}} _b
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _c
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _d.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _e.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _f.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _g.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _h.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} ___nan.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _d
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _e
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _f
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _g
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _a.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _b.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _h
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _main
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _c.thunk.0
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _d.thunk.1
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _e.thunk.1
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _f.thunk.1
+# MAP-NEXT: 0x{{[[:xdigit:]]+}} {{.*}} _z
+
+
 # CHECK: Disassembly of section __TEXT,__text:
 
 # CHECK: [[#%.13x, A_PAGE:]][[#%.3x, A_OFFSET:]] <_a>:
diff --git a/lldb/source/Host/openbsd/Host.cpp b/lldb/source/Host/openbsd/Host.cpp
index a4dc3918acfd..24650ff97075 100644
--- a/lldb/source/Host/openbsd/Host.cpp
+++ b/lldb/source/Host/openbsd/Host.cpp
@@ -41,18 +41,7 @@ namespace lldb_private {
 class ProcessLaunchInfo;
 }
 
-Environment Host::GetEnvironment() {
-  Environment env;
-  char *v;
-  char **var = environ;
-  for (; var != NULL && *var != NULL; ++var) {
-    v = strchr(*var, (int)'-');
-    if (v == NULL)
-      continue;
-    env.insert(v);
-  }
-  return env;
-}
+Environment Host::GetEnvironment() { return Environment(environ); }
 
 static bool
 GetOpenBSDProcessArgs(const ProcessInstanceInfoMatch *match_info_ptr,
@@ -127,7 +116,7 @@ static bool GetOpenBSDProcessUserAndGroup(ProcessInstanceInfo &process_info) {
         process_info.SetUserID(proc_kinfo.p_ruid);
         process_info.SetGroupID(proc_kinfo.p_rgid);
         process_info.SetEffectiveUserID(proc_kinfo.p_uid);
-	process_info.SetEffectiveGroupID(proc_kinfo.p_gid);
+        process_info.SetEffectiveGroupID(proc_kinfo.p_gid);
         return true;
       }
     }
diff --git a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
index ab4ddbfe1fb2..0ed201666716 100644
--- a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
+++ b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp
@@ -730,9 +730,19 @@ ConnectionStatus ConnectionFileDescriptor::ConnectFile(
     struct termios options;
     ::tcgetattr(fd, &options);
 
-    // Set port speed to maximum
+    // Set port speed to the available maximum
+#ifdef B115200
     ::cfsetospeed(&options, B115200);
     ::cfsetispeed(&options, B115200);
+#elif B57600
+    ::cfsetospeed(&options, B57600);
+    ::cfsetispeed(&options, B57600);
+#elif B38400
+    ::cfsetospeed(&options, B38400);
+    ::cfsetispeed(&options, B38400);
+#else
+#error "Maximum Baud rate is Unknown"
+#endif
 
     // Raw input, disable echo and signals
     options.c_lflag &= ~(ICANON | ECHO | ECHOE | ISIG);
diff --git a/lldb/source/Host/posix/FileSystemPosix.cpp b/lldb/source/Host/posix/FileSystemPosix.cpp
index 945e2affc837..4c326a29812f 100644
--- a/lldb/source/Host/posix/FileSystemPosix.cpp
+++ b/lldb/source/Host/posix/FileSystemPosix.cpp
@@ -9,16 +9,8 @@
 #include "lldb/Host/FileSystem.h"
 
 // C includes
-#include <dirent.h>
 #include <fcntl.h>
-#include <sys/mount.h>
-#include <sys/param.h>
-#include <sys/stat.h>
-#include <sys/types.h>
 #include <unistd.h>
-#if defined(__NetBSD__)
-#include <sys/statvfs.h>
-#endif
 
 // lldb Includes
 #include "lldb/Host/Host.h"
diff --git a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp
index b35e27ad8123..1d79edbede5d 100644
--- a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp
+++ b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp
@@ -1226,7 +1226,7 @@ bool lldb_private::formatters::ObjCSELSummaryProvider(
 time_t lldb_private::formatters::GetOSXEpoch() {
   static time_t epoch = 0;
   if (!epoch) {
-#ifndef _WIN32
+#if !defined(_WIN32) && !defined(_AIX)
     tzset();
     tm tm_epoch;
     tm_epoch.tm_sec = 0;
diff --git a/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp b/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp
index 3835f2b08a05..b202898ff438 100644
--- a/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp
+++ b/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp
@@ -8,7 +8,7 @@
 
 #include "ObjectContainerBSDArchive.h"
 
-#if defined(_WIN32) || defined(__ANDROID__)
+#if defined(_WIN32) || defined(__ANDROID__) || defined(_AIX)
 // Defines from ar, missing on Windows
 #define SARMAG 8
 #define ARFMAG "`\n"
diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp b/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp
index d18b718d4a56..0cf64807ec0d 100644
--- a/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp
+++ b/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp
@@ -64,7 +64,7 @@ static Status DeleteForwardPortWithAdb(uint16_t local_port,
 
 static Status FindUnusedPort(uint16_t &port) {
   Status error;
-  std::unique_ptr<TCPSocket> tcp_socket(new TCPSocket(true, false));
+  std::unique_ptr<TCPSocket> tcp_socket(new TCPSocket(true));
   if (error.Fail())
     return error;
 
diff --git a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp
index b3916cc913f7..5f85f99ce7bd 100644
--- a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp
+++ b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp
@@ -1031,6 +1031,8 @@ UUID ProcessElfCore::FindBuidIdInCoreMemory(lldb::addr_t address) {
 
   std::vector<uint8_t> ph_bytes;
   ph_bytes.resize(elf_header.e_phentsize);
+  lldb::addr_t base_addr = 0;
+  bool found_first_load_segment = false;
   for (unsigned int i = 0; i < elf_header.e_phnum; ++i) {
     byte_read = ReadMemory(ph_addr + i * elf_header.e_phentsize,
                            ph_bytes.data(), elf_header.e_phentsize, error);
@@ -1041,6 +1043,11 @@ UUID ProcessElfCore::FindBuidIdInCoreMemory(lldb::addr_t address) {
     offset = 0;
     elf::ELFProgramHeader program_header;
     program_header.Parse(program_header_data, &offset);
+    if (program_header.p_type == llvm::ELF::PT_LOAD &&
+        !found_first_load_segment) {
+      base_addr = program_header.p_vaddr;
+      found_first_load_segment = true;
+    }
     if (program_header.p_type != llvm::ELF::PT_NOTE)
       continue;
 
@@ -1049,7 +1056,7 @@ UUID ProcessElfCore::FindBuidIdInCoreMemory(lldb::addr_t address) {
 
     // We need to slide the address of the p_vaddr as these values don't get
     // relocated in memory.
-    const lldb::addr_t vaddr = program_header.p_vaddr + address;
+    const lldb::addr_t vaddr = program_header.p_vaddr + address - base_addr;
     byte_read =
         ReadMemory(vaddr, note_bytes.data(), program_header.p_memsz, error);
     if (byte_read != program_header.p_memsz)
diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md
index 7acca062f42a..2ccf30b8139a 100644
--- a/llvm/Maintainers.md
+++ b/llvm/Maintainers.md
@@ -45,8 +45,12 @@ Matthew.Arsenault@amd.com, arsenm2@gmail.com (email), [arsenm](https://github.co
 
 #### Inlining
 
-Chandler Carruth \
-chandlerc@gmail.com, chandlerc@google.com (email), [chandlerc](https://github.com/chandlerc) (GitHub)
+Arthur Eubanks \
+aeubanks@google.com (email), [aeubanks](https://github.com/aeubanks) (GitHub) \
+Mircea Trofin (esp. ML inliner) \
+mtrofin@google.com (email), [mtrofin](https://github.com/mtrofin) (GitHub) \
+Kazu Hirata (esp. module inliner and inline order) \
+kazu@google.com (email), [kazutakahirata](https://github.com/kazutakahirata) (GitHub)
 
 #### InstCombine, InstSimplify, ValueTracking, ConstantFold
 
@@ -65,6 +69,11 @@ mail@justinbogner.com (email), [bogner](https://github.com/bogner) (GitHub)
 Diego Novillo \
 dnovillo@google.com (email), [dnovillo](https://github.com/dnovillo) (GitHub)
 
+#### New pass manager, CGSCC, LazyCallGraph
+
+Arthur Eubanks \
+aeubanks@google.com (email), [aeubanks](https://github.com/aeubanks) (GitHub)
+
 #### LoopStrengthReduce
 
 Quentin Colombet \
@@ -323,16 +332,19 @@ peter@pcc.me.uk (email), [pcc](https://github.com/pcc) (GitHub)
 Chandler Carruth \
 chandlerc@gmail.com, chandlerc@google.com (email), [chandlerc](https://github.com/chandlerc) (GitHub)
 
-#### Debug info
+#### Debug info and DWARF
 
+Adrian Prantl \
+aprantl@apple.com (email), [adrian-prantl](https://github.com/adrian-prantl) (GitHub) \
+David Blaikie (especially type information) \
+dblaikie@gmail.com (email), [dwblaikie](https://github.com/dwblaike) (GitHub) \
+Jeremy Morse (especially variable information) \
+jeremy.morse@sony.com (email), [jmorse](https://github.com/jmorse) (GitHub) \
+Jonas Devlieghere (especially dsymutil/DWARFLinker) \
+jonas@devlieghere.com (email), [JDevlieghere](https://github.com/JDevlieghere) (GitHub) \
 Eric Christopher \
 echristo@gmail.com (email), [echristo](https://github.com/echristo) (GitHub)
 
-#### DWARF Parser
-
-Benjamin Kramer \
-benny.kra@gmail.com (email), [d0k](https://github.com/d0k) (GitHub)
-
 #### IR Linker and LTO
 
 Teresa Johnson \
@@ -459,7 +471,7 @@ sabre@nondot.org (email), [lattner](https://github.com/lattner) (GitHub), clattn
 
 Paul C. Anagnostopoulos (paul@windfall.com, [Paul-C-Anagnostopoulos](https://github.com/Paul-C-Anagnostopoulos)) -- TableGen \
 Justin Bogner (mail@justinbogner.com, [bogner](https://github.com/bogner)) -- SelectionDAG \
-Chandler Carruth (chandlerc@gmail.com, chandlerc@google.com, [chandlerc](https://github.com/chandlerc)) -- ADT, Support \
+Chandler Carruth (chandlerc@gmail.com, chandlerc@google.com, [chandlerc](https://github.com/chandlerc)) -- ADT, Support, Inlining \
 Peter Collingbourne (peter@pcc.me.uk, [pcc](https://github.com/pcc)) -- LTO \
 Evan Cheng (evan.cheng@apple.com) -- Parts of code generator not covered by someone else \
 Jake Ehrlich (jakehehrlich@google.com, [jakehehrlich](https://github.com/jakehehrlich)) -- llvm-objcopy and ObjCopy library \
@@ -468,6 +480,7 @@ Renato Golin (rengolin@systemcall.eu, [rengolin](https://github.com/rengolin)) -
 Venkatraman Govindaraju (venkatra@cs.wisc.edu, [vegovin](https://github.com/vegovin) -- Sparc backend \
 James Grosbach (grosbach@apple.com) -- MC layer \
 Anton Korobeynikov (anton@korobeynikov.info, [asl](https://github.com/asl)) -- ARM EABI \
+Benjamin Kramer (benny.kra@gmail.com, [d0k](https://github.com/d0k)) -- DWARF Parser \
 David Majnemer (david.majnemer@gmail.com, [majnemer](https://github.com/majnemer)) -- InstCombine, ConstantFold \
 Chad Rosier (mcrosier@codeaurora.org) -- FastISel \
 Hans Wennborg (hans@chromium.org, [zmodem](https://github.com/zmodem)) -- Release management \
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index 54a54db338e6..e046e3798e54 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -932,7 +932,7 @@ endfunction()
 
 macro(add_llvm_library name)
   cmake_parse_arguments(ARG
-    "SHARED;BUILDTREE_ONLY;MODULE;INSTALL_WITH_TOOLCHAIN"
+    "SHARED;BUILDTREE_ONLY;MODULE;INSTALL_WITH_TOOLCHAIN;NO_EXPORT"
     ""
     ""
     ${ARGN})
@@ -967,7 +967,11 @@ macro(add_llvm_library name)
         set(umbrella)
       endif()
 
-      get_target_export_arg(${name} LLVM export_to_llvmexports ${umbrella})
+      if(ARG_NO_EXPORT)
+        set(export_to_llvmexports)
+      else()
+        get_target_export_arg(${name} LLVM export_to_llvmexports ${umbrella})
+      endif()
       install(TARGETS ${name}
               ${export_to_llvmexports}
               LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT ${name}
@@ -980,7 +984,9 @@ macro(add_llvm_library name)
                                  COMPONENT ${name})
       endif()
     endif()
-    set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS ${name})
+    if(NOT ARG_NO_EXPORT)
+      set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS ${name})
+    endif()
   endif()
 
   get_subproject_title(subproject_title)
diff --git a/llvm/docs/DirectX/DXILResources.rst b/llvm/docs/DirectX/DXILResources.rst
index 3971d3788b8a..857d29e48363 100644
--- a/llvm/docs/DirectX/DXILResources.rst
+++ b/llvm/docs/DirectX/DXILResources.rst
@@ -274,39 +274,87 @@ Examples:
                @llvm.dx.handle.fromHeap.tdx.RawBuffer_v4f32_1_0(
                    i32 2, i1 false)
 
-16-byte Loads, Samples, and Gathers
------------------------------------
-
-*relevant types: TypedBuffer, CBuffer, and Textures*
-
-TypedBuffer, CBuffer, and Texture loads, as well as samples and gathers, can
-return 1 to 4 elements from the given resource, to a maximum of 16 bytes of
-data. DXIL's modeling of this is influenced by DirectX and DXBC's history and
-it generally treats these operations as returning 4 32-bit values. For 16-bit
-elements the values are 16-bit values, and for 64-bit values the operations
-return 4 32-bit integers and emit further code to construct the double.
-
-In DXIL, these operations return `ResRet`_ and `CBufRet`_ values, are structs
-containing 4 elements of the same type, and in the case of `ResRet` a 5th
-element that is used by the `CheckAccessFullyMapped`_ operation.
-
-In LLVM IR the intrinsics will return the contained type of the resource
-instead. That is, ``llvm.dx.resource.load.typedbuffer`` from a
-``Buffer<float>`` would return a single float, from ``Buffer<float4>`` a vector
-of 4 floats, and from ``Buffer<double2>`` a vector of two doubles, etc. The
-operations are then expanded out to match DXIL's format during lowering.
-
-In cases where we need ``CheckAccessFullyMapped``, we have a second intrinsic
-that returns an anonymous struct with element-0 being the contained type, and
-element-1 being the ``i1`` result of a ``CheckAccessFullyMapped`` call. We
-don't have a separate call to ``CheckAccessFullyMapped`` at all, since that's
-the only operation that can possibly be done on this value. In practice this
-may mean we insert a DXIL operation for the check when this was missing in the
-HLSL source, but this actually matches DXC's behaviour in practice.
+Accessing Resources as Memory
+-----------------------------
+
+*relevant types: Buffers, CBuffer, and Textures*
+
+Loading and storing from resources is generally represented in LLVM using
+operations on memory that is only accessible via a handle object. Given a
+handle, `llvm.dx.resource.getpointer` gives a pointer that can be used to read
+and (depending on type) write to the resource.
+
+Accesses using `llvm.dx.resource.getpointer` are replaced with direct load and
+store operations in the `DXILResourceAccess` pass. These direct loads and
+stores are described later in this document.
+
+.. note:: Currently the pointers returned by `dx.resource.getpointer` are in
+          the default address space, but that will likely change in the future.
+
+.. list-table:: ``@llvm.dx.resource.getpointer``
+   :header-rows: 1
+
+   * - Argument
+     -
+     - Type
+     - Description
+   * - Return value
+     -
+     - Pointer
+     - A pointer to an object in the buffer
+   * - ``%buffer``
+     - 0
+     - ``target(dx.TypedBuffer, ...)``
+     - The buffer to access
+   * - ``%index``
+     - 1
+     - ``i32``
+     - Index into the buffer
+
+Examples:
+
+.. code-block:: llvm
+
+   %ptr = call ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_v4f32_0_0_0t(
+       target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 %index)
+
+Loads, Samples, and Gathers
+---------------------------
+
+*relevant types: Buffers, CBuffers, and Textures*
+
+All load, sample, and gather operations in DXIL return a `ResRet`_ type, and
+CBuffer loads return a similar `CBufRet`_ type. These types are structs
+containing 4 elements of some basic type, and in the case of `ResRet` a 5th
+element that is used by the `CheckAccessFullyMapped`_ operation. Some of these
+operations, like `RawBufferLoad`_ include a mask and/or alignment that tell us
+some information about how to interpret those four values.
+
+In the LLVM IR representations of these operations we instead return scalars or
+vectors, but we keep the requirement that we only return up to 4 elements of a
+basic type. This avoids some unnecessary casting and structure manipulation in
+the intermediate format while also keeping lowering to DXIL straightforward.
+
+LLVM intrinsics that map to operations returning `ResRet` return an anonymous
+struct with element-0 being the scalar or vector type, and element-1 being the
+``i1`` result of a ``CheckAccessFullyMapped`` call. We don't have a separate
+call to ``CheckAccessFullyMapped`` at all, since that's the only operation that
+can possibly be done on this value. In practice this may mean we insert a DXIL
+operation for the check when this was missing in the HLSL source, but this
+actually matches DXC's behaviour in practice.
+
+For TypedBuffer and Texture, we map directly from the contained type of the
+resource to the return value of the intrinsic. Since these resources are
+constrained to contain only scalars and vectors of up to 4 elements, the
+lowering to DXIL ops is generally straightforward. The one exception we have
+here is that `double` types in the elements are special - these are allowed in
+the LLVM intrinsics, but are lowered to pairs of `i32` followed by
+``MakeDouble`` operations for DXIL.
 
 .. _ResRet: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#resource-operation-return-types
 .. _CBufRet: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#cbufferloadlegacy
 .. _CheckAccessFullyMapped: https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/checkaccessfullymapped
+.. _RawBufferLoad: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#rawbufferload
 
 .. list-table:: ``@llvm.dx.resource.load.typedbuffer``
    :header-rows: 1
@@ -317,8 +365,8 @@ HLSL source, but this actually matches DXC's behaviour in practice.
      - Description
    * - Return value
      -
-     - The contained type of the buffer
-     - The data loaded from the buffer
+     - A structure of the contained type and the check bit
+     - The data loaded from the buffer and the check bit
    * - ``%buffer``
      - 0
      - ``target(dx.TypedBuffer, ...)``
@@ -332,23 +380,32 @@ Examples:
 
 .. code-block:: llvm
 
-   %ret = call <4 x float>
+   %ret = call {<4 x float>, i1}
        @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_0_0_0t(
            target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 %index)
-   %ret = call float
+   %ret = call {float, i1}
        @llvm.dx.resource.load.typedbuffer.f32.tdx.TypedBuffer_f32_0_0_0t(
            target("dx.TypedBuffer", float, 0, 0, 0) %buffer, i32 %index)
-   %ret = call <4 x i32>
+   %ret = call {<4 x i32>, i1}
        @llvm.dx.resource.load.typedbuffer.v4i32.tdx.TypedBuffer_v4i32_0_0_0t(
            target("dx.TypedBuffer", <4 x i32>, 0, 0, 0) %buffer, i32 %index)
-   %ret = call <4 x half>
+   %ret = call {<4 x half>, i1}
        @llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_0_0_0t(
            target("dx.TypedBuffer", <4 x half>, 0, 0, 0) %buffer, i32 %index)
-   %ret = call <2 x double>
+   %ret = call {<2 x double>, i1}
        @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_0_0t(
            target("dx.TypedBuffer", <2 x double>, 0, 0, 0) %buffer, i32 %index)
 
-.. list-table:: ``@llvm.dx.resource.loadchecked.typedbuffer``
+For RawBuffer, an HLSL load operation may return an arbitrarily sized result,
+but we still constrain the LLVM intrinsic to return only up to 4 elements of a
+basic type. This means that larger loads are represented as a series of loads,
+which matches DXIL. Unlike in the `RawBufferLoad`_ operation, we do not need
+arguments for the mask/type size and alignment, since we can calculate these
+from the return type of the load during lowering.
+
+.. _RawBufferLoad: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#rawbufferload
+
+.. list-table:: ``@llvm.dx.resource.load.rawbuffer``
    :header-rows: 1
 
    * - Argument
@@ -357,22 +414,82 @@ Examples:
      - Description
    * - Return value
      -
-     - A structure of the contained type and the check bit
+     - A structure of a scalar or vector and the check bit
      - The data loaded from the buffer and the check bit
    * - ``%buffer``
      - 0
-     - ``target(dx.TypedBuffer, ...)``
+     - ``target(dx.RawBuffer, ...)``
      - The buffer to load from
    * - ``%index``
      - 1
      - ``i32``
      - Index into the buffer
+   * - ``%offset``
+     - 2
+     - ``i32``
+     - Offset into the structure at the given index
+
+Examples:
 
 .. code-block:: llvm
 
+   ; float
+   %ret = call {float, i1}
+       @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_f32_0_0_0t(
+           target("dx.RawBuffer", float, 0, 0, 0) %buffer,
+           i32 %index,
+           i32 0)
+   %ret = call {float, i1}
+       @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_i8_0_0_0t(
+           target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+           i32 %byte_offset,
+           i32 0)
+
+   ; float4
    %ret = call {<4 x float>, i1}
-       @llvm.dx.resource.loadchecked.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_0_0_0t(
-           target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 %index)
+       @llvm.dx.resource.load.rawbuffer.v4f32.tdx.RawBuffer_v4f32_0_0_0t(
+           target("dx.RawBuffer", float, 0, 0, 0) %buffer,
+           i32 %index,
+           i32 0)
+   %ret = call {float, i1}
+       @llvm.dx.resource.load.rawbuffer.v4f32.tdx.RawBuffer_i8_0_0_0t(
+           target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+           i32 %byte_offset,
+           i32 0)
+
+   ; struct S0 { float4 f; int4 i; };
+   %ret = call {<4 x float>, i1}
+       @llvm.dx.resource.load.rawbuffer.v4f32.tdx.RawBuffer_sl_v4f32v4i32s_0_0t(
+           target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0, 0) %buffer,
+           i32 %index,
+           i32 0)
+   %ret = call {<4 x i32>, i1}
+       @llvm.dx.resource.load.rawbuffer.v4i32.tdx.RawBuffer_sl_v4f32v4i32s_0_0t(
+           target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0, 0) %buffer,
+           i32 %index,
+           i32 1)
+
+   ; struct Q { float4 f; int3 i; }
+   ; struct R { int z; S x; }
+   %ret = call {i32, i1}
+       @llvm.dx.resource.load.rawbuffer.i32(
+           target("dx.RawBuffer", {i32, {<4 x float>, <3 x i32>}}, 0, 0, 0)
+               %buffer, i32 %index, i32 0)
+   %ret = call {<4 x float>, i1}
+       @llvm.dx.resource.load.rawbuffer.i32(
+           target("dx.RawBuffer", {i32, {<4 x float>, <3 x i32>}}, 0, 0, 0)
+               %buffer, i32 %index, i32 4)
+   %ret = call {<3 x i32>, i1}
+       @llvm.dx.resource.load.rawbuffer.i32(
+           target("dx.RawBuffer", {i32, {<4 x float>, <3 x i32>}}, 0, 0, 0)
+               %buffer, i32 %index, i32 20)
+
+   ; byteaddressbuf.Load<int64_t4>
+   %ret = call {<4 x i64>, i1}
+       @llvm.dx.resource.load.rawbuffer.v4i64.tdx.RawBuffer_i8_0_0t(
+           target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+           i32 %byte_offset,
+           i32 0)
 
 Texture and Typed Buffer Stores
 -------------------------------
diff --git a/llvm/docs/GettingStarted.rst b/llvm/docs/GettingStarted.rst
index 7f4b62f63957..7f73b61e11dd 100644
--- a/llvm/docs/GettingStarted.rst
+++ b/llvm/docs/GettingStarted.rst
@@ -233,12 +233,13 @@ Hardware
 
 LLVM is known to work on the following host platforms:
 
-================== ===================== =============
+================== ===================== ==============================
 OS                 Arch                  Compilers
-================== ===================== =============
+================== ===================== ==============================
 Linux              x86\ :sup:`1`         GCC, Clang
 Linux              amd64                 GCC, Clang
 Linux              ARM                   GCC, Clang
+Linux              AArch64               GCC, Clang
 Linux              Mips                  GCC, Clang
 Linux              PowerPC               GCC, Clang
 Linux              SystemZ               GCC, Clang
@@ -246,16 +247,19 @@ Solaris            V9 (Ultrasparc)       GCC
 DragonFlyBSD       amd64                 GCC, Clang
 FreeBSD            x86\ :sup:`1`         GCC, Clang
 FreeBSD            amd64                 GCC, Clang
+FreeBSD            AArch64               GCC, Clang
 NetBSD             x86\ :sup:`1`         GCC, Clang
 NetBSD             amd64                 GCC, Clang
 OpenBSD            x86\ :sup:`1`         GCC, Clang
 OpenBSD            amd64                 GCC, Clang
 macOS\ :sup:`2`    PowerPC               GCC
 macOS              x86                   GCC, Clang
+macOS              arm64                 Clang
 Cygwin/Win32       x86\ :sup:`1, 3`      GCC
 Windows            x86\ :sup:`1`         Visual Studio
-Windows x64        x86-64                Visual Studio
-================== ===================== =============
+Windows x64        x86-64                Visual Studio, Clang\ :sup:`4`
+Windows on Arm     ARM64                 Visual Studio, Clang\ :sup:`4`
+================== ===================== ==============================
 
 .. note::
 
@@ -263,6 +267,8 @@ Windows x64        x86-64                Visual Studio
   #. Code generation supported for 32-bit ABI only
   #. To use LLVM modules on Win32-based system, you may configure LLVM
      with ``-DBUILD_SHARED_LIBS=On``.
+  #. Visual Studio alone can compile LLVM. When using Clang, you
+     must also have Visual Studio installed.
 
 Note that Debug builds require a lot of time and disk space.  An LLVM-only build
 will need about 1-3 GB of space.  A full build of LLVM and Clang will need around
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index 835b910ec452..0dc63f34806b 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -438,6 +438,9 @@ The current vendor extensions supported are:
 ``experimental-Xqcicli``
   LLVM implements `version 0.2 of the Qualcomm uC Conditional Load Immediate extension specification <https://github.com/quic/riscv-unified-db/releases/latest>`__ by Qualcomm.  All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32.
 
+``experimental-Xqcicm``
+  LLVM implements `version 0.2 of the Qualcomm uC Conditional Move extension specification <https://github.com/quic/riscv-unified-db/releases/latest>`__ by Qualcomm.  All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32.
+
 ``experimental-Xqcics``
   LLVM implements `version 0.2 of the Qualcomm uC Conditional Select extension specification <https://github.com/quic/riscv-unified-db/releases/latest>`__ by Qualcomm.  All instructions are prefixed with `qc.` as described in the specification. These instructions are only available for riscv32.
 
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 11ee9864e517..3463dc8339fd 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -198,6 +198,7 @@ Changes to the RISC-V Backend
 * `-mcpu=syntacore-scr7` was added.
 * `-mcpu=tt-ascalon-d8` was added.
 * `-mcpu=mips-p8700` was added.
+* `-mcpu=sifive-p550` was added.
 * The `Zacas` extension is no longer marked as experimental.
 * Added Smdbltrp, Ssdbltrp extensions to -march.
 * The `Smmpm`, `Smnpm`, `Ssnpm`, `Supm`, and `Sspm` pointer masking extensions
@@ -232,6 +233,8 @@ Changes to the RISC-V Backend
   extension.
 * Adds experimental assembler support for the Qualcomm uC 'Xqcicli` (Conditional Load Immediate)
   extension.
+* Adds experimental assembler support for the Qualcomm uC 'Xqcicm` (Conditonal Move)
+  extension.
 * Added ``Sdext`` and ``Sdtrig`` extensions.
 
 Changes to the WebAssembly Backend
diff --git a/llvm/include/llvm/ADT/STLFunctionalExtras.h b/llvm/include/llvm/ADT/STLFunctionalExtras.h
index 3b9d40959d71..a4d50dc3648b 100644
--- a/llvm/include/llvm/ADT/STLFunctionalExtras.h
+++ b/llvm/include/llvm/ADT/STLFunctionalExtras.h
@@ -36,8 +36,8 @@ namespace llvm {
 /// a function_ref.
 template<typename Fn> class function_ref;
 
-template<typename Ret, typename ...Params>
-class function_ref<Ret(Params...)> {
+template <typename Ret, typename... Params>
+class LLVM_GSL_POINTER function_ref<Ret(Params...)> {
   Ret (*callback)(intptr_t callable, Params ...params) = nullptr;
   intptr_t callable;
 
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 8aa024a72afc..b4918c2d1e8a 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -1102,6 +1102,13 @@ bool mustExecuteUBIfPoisonOnPathTo(Instruction *Root,
                                    Instruction *OnPathTo,
                                    DominatorTree *DT);
 
+/// Convert an integer comparison with a constant RHS into an equivalent
+/// form with the strictness flipped predicate. Return the new predicate and
+/// corresponding constant RHS if possible. Otherwise return std::nullopt.
+/// E.g., (icmp sgt X, 0) -> (icmp sle X, 1).
+std::optional<std::pair<CmpPredicate, Constant *>>
+getFlippedStrictnessPredicateAndConstant(CmpPredicate Pred, Constant *C);
+
 /// Specific patterns of select instructions we can match.
 enum SelectPatternFlavor {
   SPF_UNKNOWN = 0,
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 8abacf1b546a..1bc69f791bd8 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -689,6 +689,9 @@ enum : unsigned {
 // ELF Relocation types for RISC-V
 enum {
 #include "ELFRelocs/RISCV.def"
+#define ELF_RISCV_NONSTANDARD_RELOC(_vendor, name, value) name = value,
+#include "ELFRelocs/RISCV_nonstandard.def"
+#undef ELF_RISCV_NONSTANDARD_RELOC
 };
 
 enum {
diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def
new file mode 100644
index 000000000000..7ae3d3f20577
--- /dev/null
+++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def
@@ -0,0 +1,28 @@
+//===--- RISC-V Nonstandard Relocation List ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ELF_RISCV_NONSTANDARD_RELOC
+#error "ELF_RISCV_NONSTANDARD_RELOC must be defined"
+#endif
+
+// ELF_RISCV_NONSTANDARD_RELOC(VENDOR, NAME, ID) defines information about
+// nonstandard relocation codes. This can be used when parsing relocations, or
+// when printing them, to provide better information.
+//
+// VENDOR should be the symbol name expected in the associated `R_RISCV_VENDOR`
+// relocation. NAME and ID work like `ELF_RELOC` but the mapping is not expected
+// to be 1:1.
+//
+// The mapping in RISCV.def is 1:1, and should be used when the only information
+// available is the relocation enum value.
+
+// Qualcomm Nonstandard Relocations
+ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_ABS20_U,    192)
+ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_BRANCH,   193)
+ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_32,       194)
+ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_JUMP_PLT, 195)
diff --git a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
index d6a1f064ec0a..0c1e707e4ecb 100644
--- a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
+++ b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
@@ -176,26 +176,25 @@ public:
   void traverse();
 
   /// Provides the instruction id of the closest reaching def instruction of
-  /// PhysReg that reaches MI, relative to the begining of MI's basic block.
-  int getReachingDef(MachineInstr *MI, MCRegister PhysReg) const;
+  /// Reg that reaches MI, relative to the begining of MI's basic block.
+  int getReachingDef(MachineInstr *MI, MCRegister Reg) const;
 
-  /// Return whether A and B use the same def of PhysReg.
+  /// Return whether A and B use the same def of Reg.
   bool hasSameReachingDef(MachineInstr *A, MachineInstr *B,
-                          MCRegister PhysReg) const;
+                          MCRegister Reg) const;
 
   /// Return whether the reaching def for MI also is live out of its parent
   /// block.
-  bool isReachingDefLiveOut(MachineInstr *MI, MCRegister PhysReg) const;
+  bool isReachingDefLiveOut(MachineInstr *MI, MCRegister Reg) const;
 
-  /// Return the local MI that produces the live out value for PhysReg, or
+  /// Return the local MI that produces the live out value for Reg, or
   /// nullptr for a non-live out or non-local def.
   MachineInstr *getLocalLiveOutMIDef(MachineBasicBlock *MBB,
-                                     MCRegister PhysReg) const;
+                                     MCRegister Reg) const;
 
   /// If a single MachineInstr creates the reaching definition, then return it.
   /// Otherwise return null.
-  MachineInstr *getUniqueReachingMIDef(MachineInstr *MI,
-                                       MCRegister PhysReg) const;
+  MachineInstr *getUniqueReachingMIDef(MachineInstr *MI, MCRegister Reg) const;
 
   /// If a single MachineInstr creates the reaching definition, for MIs operand
   /// at Idx, then return it. Otherwise return null.
@@ -207,44 +206,43 @@ public:
 
   /// Provide whether the register has been defined in the same basic block as,
   /// and before, MI.
-  bool hasLocalDefBefore(MachineInstr *MI, MCRegister PhysReg) const;
+  bool hasLocalDefBefore(MachineInstr *MI, MCRegister Reg) const;
 
   /// Return whether the given register is used after MI, whether it's a local
   /// use or a live out.
-  bool isRegUsedAfter(MachineInstr *MI, MCRegister PhysReg) const;
+  bool isRegUsedAfter(MachineInstr *MI, MCRegister Reg) const;
 
   /// Return whether the given register is defined after MI.
-  bool isRegDefinedAfter(MachineInstr *MI, MCRegister PhysReg) const;
+  bool isRegDefinedAfter(MachineInstr *MI, MCRegister Reg) const;
 
   /// Provides the clearance - the number of instructions since the closest
-  /// reaching def instuction of PhysReg that reaches MI.
-  int getClearance(MachineInstr *MI, MCRegister PhysReg) const;
+  /// reaching def instuction of Reg that reaches MI.
+  int getClearance(MachineInstr *MI, MCRegister Reg) const;
 
   /// Provides the uses, in the same block as MI, of register that MI defines.
   /// This does not consider live-outs.
-  void getReachingLocalUses(MachineInstr *MI, MCRegister PhysReg,
+  void getReachingLocalUses(MachineInstr *MI, MCRegister Reg,
                             InstSet &Uses) const;
 
-  /// Search MBB for a definition of PhysReg and insert it into Defs. If no
+  /// Search MBB for a definition of Reg and insert it into Defs. If no
   /// definition is found, recursively search the predecessor blocks for them.
-  void getLiveOuts(MachineBasicBlock *MBB, MCRegister PhysReg, InstSet &Defs,
+  void getLiveOuts(MachineBasicBlock *MBB, MCRegister Reg, InstSet &Defs,
                    BlockSet &VisitedBBs) const;
-  void getLiveOuts(MachineBasicBlock *MBB, MCRegister PhysReg,
-                   InstSet &Defs) const;
+  void getLiveOuts(MachineBasicBlock *MBB, MCRegister Reg, InstSet &Defs) const;
 
   /// For the given block, collect the instructions that use the live-in
   /// value of the provided register. Return whether the value is still
   /// live on exit.
-  bool getLiveInUses(MachineBasicBlock *MBB, MCRegister PhysReg,
+  bool getLiveInUses(MachineBasicBlock *MBB, MCRegister Reg,
                      InstSet &Uses) const;
 
-  /// Collect the users of the value stored in PhysReg, which is defined
+  /// Collect the users of the value stored in Reg, which is defined
   /// by MI.
-  void getGlobalUses(MachineInstr *MI, MCRegister PhysReg, InstSet &Uses) const;
+  void getGlobalUses(MachineInstr *MI, MCRegister Reg, InstSet &Uses) const;
 
-  /// Collect all possible definitions of the value stored in PhysReg, which is
+  /// Collect all possible definitions of the value stored in Reg, which is
   /// used by MI.
-  void getGlobalReachingDefs(MachineInstr *MI, MCRegister PhysReg,
+  void getGlobalReachingDefs(MachineInstr *MI, MCRegister Reg,
                              InstSet &Defs) const;
 
   /// Return whether From can be moved forwards to just before To.
@@ -269,12 +267,12 @@ public:
 
   /// Return whether a MachineInstr could be inserted at MI and safely define
   /// the given register without affecting the program.
-  bool isSafeToDefRegAt(MachineInstr *MI, MCRegister PhysReg) const;
+  bool isSafeToDefRegAt(MachineInstr *MI, MCRegister Reg) const;
 
   /// Return whether a MachineInstr could be inserted at MI and safely define
   /// the given register without affecting the program, ignoring any effects
   /// on the provided instructions.
-  bool isSafeToDefRegAt(MachineInstr *MI, MCRegister PhysReg,
+  bool isSafeToDefRegAt(MachineInstr *MI, MCRegister Reg,
                         InstSet &Ignore) const;
 
 private:
@@ -309,9 +307,8 @@ private:
   MachineInstr *getInstFromId(MachineBasicBlock *MBB, int InstId) const;
 
   /// Provides the instruction of the closest reaching def instruction of
-  /// PhysReg that reaches MI, relative to the begining of MI's basic block.
-  MachineInstr *getReachingLocalMIDef(MachineInstr *MI,
-                                      MCRegister PhysReg) const;
+  /// Reg that reaches MI, relative to the begining of MI's basic block.
+  MachineInstr *getReachingLocalMIDef(MachineInstr *MI, MCRegister Reg) const;
 };
 
 } // namespace llvm
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/llvm/include/llvm/ExecutionEngine/Orc/Core.h
index 2788932ca4bc..db853362f657 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Core.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Core.h
@@ -1312,6 +1312,7 @@ public:
   MaterializationTask(std::unique_ptr<MaterializationUnit> MU,
                       std::unique_ptr<MaterializationResponsibility> MR)
       : MU(std::move(MU)), MR(std::move(MR)) {}
+  ~MaterializationTask() override;
   void printDescription(raw_ostream &OS) override;
   void run() override;
 
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TaskDispatch.h b/llvm/include/llvm/ExecutionEngine/Orc/TaskDispatch.h
index 8c65677aae25..d7939864fd60 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/TaskDispatch.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TaskDispatch.h
@@ -122,7 +122,7 @@ public:
   void shutdown() override;
 private:
   std::mutex DispatchMutex;
-  bool Running = true;
+  bool Shutdown = false;
   size_t Outstanding = 0;
   std::condition_variable OutstandingCV;
 
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index b73309175f20..0332a6cc2e76 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -102,6 +102,11 @@ public:
   FastMathFlags get(FastMathFlags Default) const {
     return FMF.value_or(Default);
   }
+  /// Intersect the FMF from two instructions.
+  static FMFSource intersect(Value *A, Value *B) {
+    return FMFSource(cast<FPMathOperator>(A)->getFastMathFlags() &
+                     cast<FPMathOperator>(B)->getFastMathFlags());
+  }
 };
 
 /// Common base class shared among various IRBuilders.
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 53a66099a92b..cc7a81e15f66 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2974,6 +2974,7 @@ let TargetPrefix = "aarch64" in {
 
 
   def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
+  def int_aarch64_sme_in_streaming_mode : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrNoMem]>, ClangBuiltin<"__builtin_arm_in_streaming_mode">;
 
   class SME_OuterProduct_Intrinsic
       : DefaultAttrsIntrinsic<[],
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index d31d5afe5145..3b1d1a88e01a 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -31,19 +31,20 @@ def int_dx_resource_getpointer
     : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [llvm_any_ty, llvm_i32_ty],
                             [IntrNoMem]>;
 def int_dx_resource_load_typedbuffer
-    : DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_any_ty, llvm_i32_ty],
-                            [IntrReadMem]>;
-def int_dx_resource_loadchecked_typedbuffer
     : DefaultAttrsIntrinsic<[llvm_any_ty, llvm_i1_ty],
                             [llvm_any_ty, llvm_i32_ty], [IntrReadMem]>;
 def int_dx_resource_store_typedbuffer
     : DefaultAttrsIntrinsic<[], [llvm_any_ty, llvm_i32_ty, llvm_anyvector_ty],
                             [IntrWriteMem]>;
+def int_dx_resource_load_rawbuffer
+    : DefaultAttrsIntrinsic<[llvm_any_ty, llvm_i1_ty],
+                            [llvm_any_ty, llvm_i32_ty, llvm_i32_ty],
+                            [IntrReadMem]>;
 
 def int_dx_resource_updatecounter
     : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_i8_ty],
                             [IntrInaccessibleMemOrArgMemOnly]>;
-    
+
 // Cast between target extension handle types and dxil-style opaque handles
 def int_dx_resource_casthandle : Intrinsic<[llvm_any_ty], [llvm_any_ty]>;
 
@@ -105,7 +106,7 @@ def int_dx_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrCon
 def int_dx_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
 def int_dx_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>;
 def int_dx_step : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>], [IntrNoMem]>;
-def int_dx_splitdouble : DefaultAttrsIntrinsic<[llvm_anyint_ty, LLVMMatchType<0>], 
+def int_dx_splitdouble : DefaultAttrsIntrinsic<[llvm_anyint_ty, LLVMMatchType<0>],
     [LLVMScalarOrSameVectorWidth<0, llvm_double_ty>], [IntrNoMem]>;
 def int_dx_radians : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
 def int_dx_discard : DefaultAttrsIntrinsic<[], [llvm_i1_ty], []>;
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index e6e21dcd6d69..b4d2dce66a6f 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -33,7 +33,7 @@ let TargetPrefix = "spv" in {
   def int_spv_ptrcast : Intrinsic<[llvm_any_ty], [llvm_any_ty, llvm_metadata_ty, llvm_i32_ty], [ImmArg<ArgIndex<2>>]>;
   def int_spv_switch : Intrinsic<[], [llvm_any_ty, llvm_vararg_ty]>;
   def int_spv_loop_merge : Intrinsic<[], [llvm_vararg_ty]>;
-  def int_spv_selection_merge : Intrinsic<[], [llvm_any_ty, llvm_i32_ty], [ImmArg<ArgIndex<1>>]>;
+  def int_spv_selection_merge : Intrinsic<[], [llvm_vararg_ty]>;
   def int_spv_cmpxchg : Intrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_vararg_ty]>;
   def int_spv_unreachable : Intrinsic<[], []>;
   def int_spv_alloca : Intrinsic<[llvm_any_ty], [llvm_i8_ty], [ImmArg<ArgIndex<0>>]>;
@@ -118,6 +118,10 @@ let TargetPrefix = "spv" in {
       : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_i8_ty],
                               [IntrInaccessibleMemOrArgMemOnly]>;
 
+  def int_spv_resource_getpointer
+      : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [llvm_any_ty, llvm_i32_ty],
+                              [IntrNoMem]>;
+
   // Read a value from the image buffer. It does not translate directly to a
   // single OpImageRead because the result type is not necessarily a 4 element
   // vector.
diff --git a/llvm/include/llvm/IR/NVVMIntrinsicFlags.h b/llvm/include/llvm/IR/NVVMIntrinsicFlags.h
deleted file mode 100644
index dfb6e857b3a6..000000000000
--- a/llvm/include/llvm/IR/NVVMIntrinsicFlags.h
+++ /dev/null
@@ -1,39 +0,0 @@
-//===--- NVVMIntrinsicFlags.h -----------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This file contains the definitions of the enumerations and flags
-/// associated with NVVM Intrinsics.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_IR_NVVMINTRINSICFLAGS_H
-#define LLVM_IR_NVVMINTRINSICFLAGS_H
-
-#include <stdint.h>
-
-namespace llvm {
-namespace nvvm {
-
-// Reduction Ops supported with TMA Copy from Shared
-// to Global Memory for the "cp.reduce.async.bulk.tensor.*"
-// family of PTX instructions.
-enum class TMAReductionOp : uint8_t {
-  ADD = 0,
-  MIN = 1,
-  MAX = 2,
-  INC = 3,
-  DEC = 4,
-  AND = 5,
-  OR = 6,
-  XOR = 7,
-};
-
-} // namespace nvvm
-} // namespace llvm
-#endif // LLVM_IR_NVVMINTRINSICFLAGS_H
diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
new file mode 100644
index 000000000000..8ca073ba8225
--- /dev/null
+++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
@@ -0,0 +1,176 @@
+//===--- NVVMIntrinsicUtils.h -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file contains the definitions of the enumerations and flags
+/// associated with NVVM Intrinsics, along with some helper functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_IR_NVVMINTRINSICUTILS_H
+#define LLVM_IR_NVVMINTRINSICUTILS_H
+
+#include <stdint.h>
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
+
+namespace llvm {
+namespace nvvm {
+
+// Reduction Ops supported with TMA Copy from Shared
+// to Global Memory for the "cp.reduce.async.bulk.tensor.*"
+// family of PTX instructions.
+enum class TMAReductionOp : uint8_t {
+  ADD = 0,
+  MIN = 1,
+  MAX = 2,
+  INC = 3,
+  DEC = 4,
+  AND = 5,
+  OR = 6,
+  XOR = 7,
+};
+
+inline bool IntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) {
+  switch (IntrinsicID) {
+  // Float to i32 / i64 conversion intrinsics:
+  case Intrinsic::nvvm_f2i_rm_ftz:
+  case Intrinsic::nvvm_f2i_rn_ftz:
+  case Intrinsic::nvvm_f2i_rp_ftz:
+  case Intrinsic::nvvm_f2i_rz_ftz:
+
+  case Intrinsic::nvvm_f2ui_rm_ftz:
+  case Intrinsic::nvvm_f2ui_rn_ftz:
+  case Intrinsic::nvvm_f2ui_rp_ftz:
+  case Intrinsic::nvvm_f2ui_rz_ftz:
+
+  case Intrinsic::nvvm_f2ll_rm_ftz:
+  case Intrinsic::nvvm_f2ll_rn_ftz:
+  case Intrinsic::nvvm_f2ll_rp_ftz:
+  case Intrinsic::nvvm_f2ll_rz_ftz:
+
+  case Intrinsic::nvvm_f2ull_rm_ftz:
+  case Intrinsic::nvvm_f2ull_rn_ftz:
+  case Intrinsic::nvvm_f2ull_rp_ftz:
+  case Intrinsic::nvvm_f2ull_rz_ftz:
+    return true;
+  }
+  return false;
+}
+
+inline bool IntrinsicConvertsToSignedInteger(Intrinsic::ID IntrinsicID) {
+  switch (IntrinsicID) {
+  // f2i
+  case Intrinsic::nvvm_f2i_rm:
+  case Intrinsic::nvvm_f2i_rm_ftz:
+  case Intrinsic::nvvm_f2i_rn:
+  case Intrinsic::nvvm_f2i_rn_ftz:
+  case Intrinsic::nvvm_f2i_rp:
+  case Intrinsic::nvvm_f2i_rp_ftz:
+  case Intrinsic::nvvm_f2i_rz:
+  case Intrinsic::nvvm_f2i_rz_ftz:
+  // d2i
+  case Intrinsic::nvvm_d2i_rm:
+  case Intrinsic::nvvm_d2i_rn:
+  case Intrinsic::nvvm_d2i_rp:
+  case Intrinsic::nvvm_d2i_rz:
+  // f2ll
+  case Intrinsic::nvvm_f2ll_rm:
+  case Intrinsic::nvvm_f2ll_rm_ftz:
+  case Intrinsic::nvvm_f2ll_rn:
+  case Intrinsic::nvvm_f2ll_rn_ftz:
+  case Intrinsic::nvvm_f2ll_rp:
+  case Intrinsic::nvvm_f2ll_rp_ftz:
+  case Intrinsic::nvvm_f2ll_rz:
+  case Intrinsic::nvvm_f2ll_rz_ftz:
+  // d2ll
+  case Intrinsic::nvvm_d2ll_rm:
+  case Intrinsic::nvvm_d2ll_rn:
+  case Intrinsic::nvvm_d2ll_rp:
+  case Intrinsic::nvvm_d2ll_rz:
+    return true;
+  }
+  return false;
+}
+
+inline APFloat::roundingMode
+IntrinsicGetRoundingMode(Intrinsic::ID IntrinsicID) {
+  switch (IntrinsicID) {
+  // RM:
+  case Intrinsic::nvvm_f2i_rm:
+  case Intrinsic::nvvm_f2ui_rm:
+  case Intrinsic::nvvm_f2i_rm_ftz:
+  case Intrinsic::nvvm_f2ui_rm_ftz:
+  case Intrinsic::nvvm_d2i_rm:
+  case Intrinsic::nvvm_d2ui_rm:
+
+  case Intrinsic::nvvm_f2ll_rm:
+  case Intrinsic::nvvm_f2ull_rm:
+  case Intrinsic::nvvm_f2ll_rm_ftz:
+  case Intrinsic::nvvm_f2ull_rm_ftz:
+  case Intrinsic::nvvm_d2ll_rm:
+  case Intrinsic::nvvm_d2ull_rm:
+    return APFloat::rmTowardNegative;
+
+  // RN:
+  case Intrinsic::nvvm_f2i_rn:
+  case Intrinsic::nvvm_f2ui_rn:
+  case Intrinsic::nvvm_f2i_rn_ftz:
+  case Intrinsic::nvvm_f2ui_rn_ftz:
+  case Intrinsic::nvvm_d2i_rn:
+  case Intrinsic::nvvm_d2ui_rn:
+
+  case Intrinsic::nvvm_f2ll_rn:
+  case Intrinsic::nvvm_f2ull_rn:
+  case Intrinsic::nvvm_f2ll_rn_ftz:
+  case Intrinsic::nvvm_f2ull_rn_ftz:
+  case Intrinsic::nvvm_d2ll_rn:
+  case Intrinsic::nvvm_d2ull_rn:
+    return APFloat::rmNearestTiesToEven;
+
+  // RP:
+  case Intrinsic::nvvm_f2i_rp:
+  case Intrinsic::nvvm_f2ui_rp:
+  case Intrinsic::nvvm_f2i_rp_ftz:
+  case Intrinsic::nvvm_f2ui_rp_ftz:
+  case Intrinsic::nvvm_d2i_rp:
+  case Intrinsic::nvvm_d2ui_rp:
+
+  case Intrinsic::nvvm_f2ll_rp:
+  case Intrinsic::nvvm_f2ull_rp:
+  case Intrinsic::nvvm_f2ll_rp_ftz:
+  case Intrinsic::nvvm_f2ull_rp_ftz:
+  case Intrinsic::nvvm_d2ll_rp:
+  case Intrinsic::nvvm_d2ull_rp:
+    return APFloat::rmTowardPositive;
+
+  // RZ:
+  case Intrinsic::nvvm_f2i_rz:
+  case Intrinsic::nvvm_f2ui_rz:
+  case Intrinsic::nvvm_f2i_rz_ftz:
+  case Intrinsic::nvvm_f2ui_rz_ftz:
+  case Intrinsic::nvvm_d2i_rz:
+  case Intrinsic::nvvm_d2ui_rz:
+
+  case Intrinsic::nvvm_f2ll_rz:
+  case Intrinsic::nvvm_f2ull_rz:
+  case Intrinsic::nvvm_f2ll_rz_ftz:
+  case Intrinsic::nvvm_f2ull_rz_ftz:
+  case Intrinsic::nvvm_d2ll_rz:
+  case Intrinsic::nvvm_d2ull_rz:
+    return APFloat::rmTowardZero;
+  }
+  llvm_unreachable("Invalid f2i/d2i rounding mode intrinsic");
+  return APFloat::roundingMode::Invalid;
+}
+
+} // namespace nvvm
+} // namespace llvm
+#endif // LLVM_IR_NVVMINTRINSICUTILS_H
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index b37f967191aa..cd9a36029e6d 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2870,7 +2870,7 @@ template <typename Opnd_t> struct Signum_match {
       return false;
 
     unsigned ShiftWidth = TypeSize - 1;
-    Value *OpL = nullptr, *OpR = nullptr;
+    Value *Op;
 
     // This is the representation of signum we match:
     //
@@ -2882,11 +2882,11 @@ template <typename Opnd_t> struct Signum_match {
     //
     // for i1 values.
 
-    auto LHS = m_AShr(m_Value(OpL), m_SpecificInt(ShiftWidth));
-    auto RHS = m_LShr(m_Neg(m_Value(OpR)), m_SpecificInt(ShiftWidth));
-    auto Signum = m_Or(LHS, RHS);
+    auto LHS = m_AShr(m_Value(Op), m_SpecificInt(ShiftWidth));
+    auto RHS = m_LShr(m_Neg(m_Deferred(Op)), m_SpecificInt(ShiftWidth));
+    auto Signum = m_c_Or(LHS, RHS);
 
-    return Signum.match(V) && OpL == OpR && Val.match(OpL);
+    return Signum.match(V) && Val.match(Op);
   }
 };
 
diff --git a/llvm/include/llvm/Option/OptTable.h b/llvm/include/llvm/Option/OptTable.h
index decb6cb5455e..38a03fef7ae1 100644
--- a/llvm/include/llvm/Option/OptTable.h
+++ b/llvm/include/llvm/Option/OptTable.h
@@ -451,7 +451,7 @@ protected:
   LLVM_MAKE_OPT_ID_WITH_ID_PREFIX(OPT_, PREFIXES_OFFSET, PREFIXED_NAME_OFFSET, \
                                   ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS,    \
                                   VISIBILITY, PARAM, HELPTEXT,                 \
-                                  HELPTEXTSFORVARIANTS, METAVAR, VALUE)
+                                  HELPTEXTSFORVARIANTS, METAVAR, VALUES)
 
 #define LLVM_CONSTRUCT_OPT_INFO_WITH_ID_PREFIX(                                \
     ID_PREFIX, PREFIXES_OFFSET, PREFIXED_NAME_OFFSET, ID, KIND, GROUP, ALIAS,  \
diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index a9ad258716a1..72b66db271f1 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -34,6 +34,7 @@
 #include <cassert>
 #include <cstdint>
 #include <iterator>
+#include <map>
 #include <memory>
 #include <optional>
 #include <sstream>
@@ -214,6 +215,14 @@ public:
   /// Return a counter that represents the expression that subtracts RHS from
   /// LHS.
   Counter subtract(Counter LHS, Counter RHS, bool Simplify = true);
+
+  /// K to V map. K will be Counter in most cases. V may be Counter or
+  /// Expression.
+  using SubstMap = std::map<Counter, Counter>;
+
+  /// \return A counter equivalent to \C, with each term in its
+  /// expression replaced with term from \p Map.
+  Counter subst(Counter C, const SubstMap &Map);
 };
 
 using LineColPair = std::pair<unsigned, unsigned>;
diff --git a/llvm/include/llvm/Support/Recycler.h b/llvm/include/llvm/Support/Recycler.h
index bbd9ae321ae3..693c6559ff2f 100644
--- a/llvm/include/llvm/Support/Recycler.h
+++ b/llvm/include/llvm/Support/Recycler.h
@@ -60,6 +60,10 @@ public:
     // clear() before deleting the Recycler.
     assert(!FreeList && "Non-empty recycler deleted!");
   }
+  Recycler(const Recycler &) = delete;
+  Recycler(Recycler &&Other)
+      : FreeList(std::exchange(Other.FreeList, nullptr)) {}
+  Recycler() = default;
 
   /// clear - Release all the tracked allocations to the allocator. The
   /// recycler must be free of any tracked allocations before being
diff --git a/llvm/include/llvm/TargetParser/AArch64FeatPriorities.inc b/llvm/include/llvm/TargetParser/AArch64FeatPriorities.inc
new file mode 100644
index 000000000000..96af618032ae
--- /dev/null
+++ b/llvm/include/llvm/TargetParser/AArch64FeatPriorities.inc
@@ -0,0 +1,66 @@
+//===- AArch64FeatPriorities.inc - AArch64 FMV Priorities enum --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file enumerates the AArch64 FMV features sorted in ascending priority.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AARCH64_FEAT_PRIORITIES_INC_H
+#define AARCH64_FEAT_PRIORITIES_INC_H
+
+// Function Multi Versioning feature priorities.
+enum FeatPriorities {
+  PRIOR_RNG,
+  PRIOR_FLAGM,
+  PRIOR_FLAGM2,
+  PRIOR_LSE,
+  PRIOR_FP,
+  PRIOR_SIMD,
+  PRIOR_DOTPROD,
+  PRIOR_SM4,
+  PRIOR_RDM,
+  PRIOR_CRC,
+  PRIOR_SHA2,
+  PRIOR_SHA3,
+  PRIOR_PMULL,
+  PRIOR_FP16,
+  PRIOR_FP16FML,
+  PRIOR_DIT,
+  PRIOR_DPB,
+  PRIOR_DPB2,
+  PRIOR_JSCVT,
+  PRIOR_FCMA,
+  PRIOR_RCPC,
+  PRIOR_RCPC2,
+  PRIOR_RCPC3,
+  PRIOR_FRINTTS,
+  PRIOR_I8MM,
+  PRIOR_BF16,
+  PRIOR_SVE,
+  PRIOR_SVE_F32MM,
+  PRIOR_SVE_F64MM,
+  PRIOR_SVE2,
+  PRIOR_SVE_PMULL128,
+  PRIOR_SVE_BITPERM,
+  PRIOR_SVE_SHA3,
+  PRIOR_SVE_SM4,
+  PRIOR_SME,
+  PRIOR_MEMTAG2,
+  PRIOR_SB,
+  PRIOR_PREDRES,
+  PRIOR_SSBS2,
+  PRIOR_BTI,
+  PRIOR_LS64_ACCDATA,
+  PRIOR_WFXT,
+  PRIOR_SME_F64,
+  PRIOR_SME_I64,
+  PRIOR_SME2,
+  PRIOR_MOPS
+};
+
+#endif
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index ac8006d671a0..63f06a3a6929 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -36,6 +36,7 @@ struct ArchInfo;
 struct CpuInfo;
 
 #include "llvm/TargetParser/AArch64CPUFeatures.inc"
+#include "llvm/TargetParser/AArch64FeatPriorities.inc"
 
 static_assert(FEAT_MAX < 62,
               "Number of features in CPUFeatures are limited to 62 entries");
@@ -70,12 +71,12 @@ struct ExtensionInfo {
 
 struct FMVInfo {
   StringRef Name;                // The target_version/target_clones spelling.
-  CPUFeatures Bit;               // Index of the bit in the FMV feature bitset.
+  CPUFeatures FeatureBit;        // Index of the bit in the FMV feature bitset.
+  FeatPriorities PriorityBit;    // Index of the bit in the FMV priority bitset.
   std::optional<ArchExtKind> ID; // The architecture extension to enable.
-  unsigned Priority;             // FMV priority.
-  FMVInfo(StringRef Name, CPUFeatures Bit, std::optional<ArchExtKind> ID,
-          unsigned Priority)
-      : Name(Name), Bit(Bit), ID(ID), Priority(Priority) {};
+  FMVInfo(StringRef Name, CPUFeatures FeatureBit, FeatPriorities PriorityBit,
+          std::optional<ArchExtKind> ID)
+      : Name(Name), FeatureBit(FeatureBit), PriorityBit(PriorityBit), ID(ID) {};
 };
 
 const std::vector<FMVInfo> &getFMVInfo();
@@ -270,7 +271,7 @@ void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
 bool isX18ReservedByDefault(const Triple &TT);
 
 // Return the priority for a given set of FMV features.
-unsigned getFMVPriority(ArrayRef<StringRef> Features);
+uint64_t getFMVPriority(ArrayRef<StringRef> Features);
 
 // For given feature names, return a bitmask corresponding to the entries of
 // AArch64::CPUFeatures. The values in CPUFeatures are not bitmasks themselves,
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index 844f11feef41..76914ab34c1f 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -564,6 +564,11 @@ public:
 
   bool isOSzOS() const { return getOS() == Triple::ZOS; }
 
+  /// Is this an Apple MachO triple.
+  bool isAppleMachO() const {
+    return (getVendor() == Triple::Apple) && isOSBinFormatMachO();
+  }
+
   /// Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit).
   bool isOSDarwin() const {
     return isMacOSX() || isiOS() || isWatchOS() || isDriverKit() || isXROS();
diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index a8ee3cd531e4..28bce7b90665 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -1478,7 +1478,7 @@ struct AttributorConfig {
   /// The name of the pass running the attributor, used to emit remarks.
   const char *PassName = nullptr;
 
-  using IPOAmendableCBTy = function_ref<bool(const Function &F)>;
+  using IPOAmendableCBTy = std::function<bool(const Function &F)>;
   IPOAmendableCBTy IPOAmendableCB;
 };
 
diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
index 71592058e345..fa6b60cba15a 100644
--- a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
+++ b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
@@ -184,12 +184,6 @@ public:
     return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1));
   }
 
-  std::optional<std::pair<
-      CmpPredicate,
-      Constant *>> static getFlippedStrictnessPredicateAndConstant(CmpPredicate
-                                                                       Pred,
-                                                                   Constant *C);
-
   static bool shouldAvoidAbsorbingNotIntoSelect(const SelectInst &SI) {
     // a ? b : false and a ? true : b are the canonical form of logical and/or.
     // This includes !a ? b : false and !a ? true : b. Absorbing the not into
diff --git a/llvm/include/llvm/Transforms/Instrumentation/TypeSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/TypeSanitizer.h
index a6cc56df35f1..20f08b679085 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/TypeSanitizer.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/TypeSanitizer.h
@@ -20,19 +20,11 @@ class Function;
 class FunctionPass;
 class Module;
 
-/// A function pass for tysan instrumentation.
 struct TypeSanitizerPass : public PassInfoMixin<TypeSanitizerPass> {
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
-  static bool isRequired() { return true; }
-};
-
-/// A module pass for tysan instrumentation.
-///
-/// Create ctor and init functions.
-struct ModuleTypeSanitizerPass : public PassInfoMixin<ModuleTypeSanitizerPass> {
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
   static bool isRequired() { return true; }
 };
 
 } // namespace llvm
+
 #endif /* LLVM_TRANSFORMS_INSTRUMENTATION_TYPESANITIZER_H */
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h
index 3959f84c601e..9b68d47ce39a 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h
@@ -79,6 +79,10 @@ public:
     for (auto *N : this->Nodes)
       N->setSchedBundle(*this);
   }
+  /// Copy CTOR (unimplemented).
+  SchedBundle(const SchedBundle &Other) = delete;
+  /// Copy Assignment (unimplemented).
+  SchedBundle &operator=(const SchedBundle &Other) = delete;
   ~SchedBundle() {
     for (auto *N : this->Nodes)
       N->clearSchedBundle();
diff --git a/llvm/include/module.modulemap b/llvm/include/module.modulemap
index 6beb0e03e222..46277e128d98 100644
--- a/llvm/include/module.modulemap
+++ b/llvm/include/module.modulemap
@@ -96,6 +96,7 @@ module LLVM_BinaryFormat {
     textual header "llvm/BinaryFormat/ELFRelocs/PowerPC64.def"
     textual header "llvm/BinaryFormat/ELFRelocs/PowerPC.def"
     textual header "llvm/BinaryFormat/ELFRelocs/RISCV.def"
+    textual header "llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def"
     textual header "llvm/BinaryFormat/ELFRelocs/Sparc.def"
     textual header "llvm/BinaryFormat/ELFRelocs/SystemZ.def"
     textual header "llvm/BinaryFormat/ELFRelocs/VE.def"
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 88533f2972fa..031d675c330e 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -45,8 +45,10 @@
 #include "llvm/IR/IntrinsicsAArch64.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsARM.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/IR/IntrinsicsWebAssembly.h"
 #include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/NVVMIntrinsicUtils.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
@@ -1687,6 +1689,58 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
   case Intrinsic::x86_avx512_cvttsd2usi64:
     return !Call->isStrictFP();
 
+  // NVVM float/double to int32/uint32 conversion intrinsics
+  case Intrinsic::nvvm_f2i_rm:
+  case Intrinsic::nvvm_f2i_rn:
+  case Intrinsic::nvvm_f2i_rp:
+  case Intrinsic::nvvm_f2i_rz:
+  case Intrinsic::nvvm_f2i_rm_ftz:
+  case Intrinsic::nvvm_f2i_rn_ftz:
+  case Intrinsic::nvvm_f2i_rp_ftz:
+  case Intrinsic::nvvm_f2i_rz_ftz:
+  case Intrinsic::nvvm_f2ui_rm:
+  case Intrinsic::nvvm_f2ui_rn:
+  case Intrinsic::nvvm_f2ui_rp:
+  case Intrinsic::nvvm_f2ui_rz:
+  case Intrinsic::nvvm_f2ui_rm_ftz:
+  case Intrinsic::nvvm_f2ui_rn_ftz:
+  case Intrinsic::nvvm_f2ui_rp_ftz:
+  case Intrinsic::nvvm_f2ui_rz_ftz:
+  case Intrinsic::nvvm_d2i_rm:
+  case Intrinsic::nvvm_d2i_rn:
+  case Intrinsic::nvvm_d2i_rp:
+  case Intrinsic::nvvm_d2i_rz:
+  case Intrinsic::nvvm_d2ui_rm:
+  case Intrinsic::nvvm_d2ui_rn:
+  case Intrinsic::nvvm_d2ui_rp:
+  case Intrinsic::nvvm_d2ui_rz:
+
+  // NVVM float/double to int64/uint64 conversion intrinsics
+  case Intrinsic::nvvm_f2ll_rm:
+  case Intrinsic::nvvm_f2ll_rn:
+  case Intrinsic::nvvm_f2ll_rp:
+  case Intrinsic::nvvm_f2ll_rz:
+  case Intrinsic::nvvm_f2ll_rm_ftz:
+  case Intrinsic::nvvm_f2ll_rn_ftz:
+  case Intrinsic::nvvm_f2ll_rp_ftz:
+  case Intrinsic::nvvm_f2ll_rz_ftz:
+  case Intrinsic::nvvm_f2ull_rm:
+  case Intrinsic::nvvm_f2ull_rn:
+  case Intrinsic::nvvm_f2ull_rp:
+  case Intrinsic::nvvm_f2ull_rz:
+  case Intrinsic::nvvm_f2ull_rm_ftz:
+  case Intrinsic::nvvm_f2ull_rn_ftz:
+  case Intrinsic::nvvm_f2ull_rp_ftz:
+  case Intrinsic::nvvm_f2ull_rz_ftz:
+  case Intrinsic::nvvm_d2ll_rm:
+  case Intrinsic::nvvm_d2ll_rn:
+  case Intrinsic::nvvm_d2ll_rp:
+  case Intrinsic::nvvm_d2ll_rz:
+  case Intrinsic::nvvm_d2ull_rm:
+  case Intrinsic::nvvm_d2ull_rn:
+  case Intrinsic::nvvm_d2ull_rp:
+  case Intrinsic::nvvm_d2ull_rz:
+
   // Sign operations are actually bitwise operations, they do not raise
   // exceptions even for SNANs.
   case Intrinsic::fabs:
@@ -1849,6 +1903,12 @@ inline bool llvm_fenv_testexcept() {
   return false;
 }
 
+static const APFloat FTZPreserveSign(const APFloat &V) {
+  if (V.isDenormal())
+    return APFloat::getZero(V.getSemantics(), V.isNegative());
+  return V;
+}
+
 Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V,
                          Type *Ty) {
   llvm_fenv_clearexcept();
@@ -2309,6 +2369,85 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
       return ConstantFP::get(Ty->getContext(), U);
     }
 
+    // NVVM float/double to signed/unsigned int32/int64 conversions:
+    switch (IntrinsicID) {
+    // f2i
+    case Intrinsic::nvvm_f2i_rm:
+    case Intrinsic::nvvm_f2i_rn:
+    case Intrinsic::nvvm_f2i_rp:
+    case Intrinsic::nvvm_f2i_rz:
+    case Intrinsic::nvvm_f2i_rm_ftz:
+    case Intrinsic::nvvm_f2i_rn_ftz:
+    case Intrinsic::nvvm_f2i_rp_ftz:
+    case Intrinsic::nvvm_f2i_rz_ftz:
+    // f2ui
+    case Intrinsic::nvvm_f2ui_rm:
+    case Intrinsic::nvvm_f2ui_rn:
+    case Intrinsic::nvvm_f2ui_rp:
+    case Intrinsic::nvvm_f2ui_rz:
+    case Intrinsic::nvvm_f2ui_rm_ftz:
+    case Intrinsic::nvvm_f2ui_rn_ftz:
+    case Intrinsic::nvvm_f2ui_rp_ftz:
+    case Intrinsic::nvvm_f2ui_rz_ftz:
+    // d2i
+    case Intrinsic::nvvm_d2i_rm:
+    case Intrinsic::nvvm_d2i_rn:
+    case Intrinsic::nvvm_d2i_rp:
+    case Intrinsic::nvvm_d2i_rz:
+    // d2ui
+    case Intrinsic::nvvm_d2ui_rm:
+    case Intrinsic::nvvm_d2ui_rn:
+    case Intrinsic::nvvm_d2ui_rp:
+    case Intrinsic::nvvm_d2ui_rz:
+    // f2ll
+    case Intrinsic::nvvm_f2ll_rm:
+    case Intrinsic::nvvm_f2ll_rn:
+    case Intrinsic::nvvm_f2ll_rp:
+    case Intrinsic::nvvm_f2ll_rz:
+    case Intrinsic::nvvm_f2ll_rm_ftz:
+    case Intrinsic::nvvm_f2ll_rn_ftz:
+    case Intrinsic::nvvm_f2ll_rp_ftz:
+    case Intrinsic::nvvm_f2ll_rz_ftz:
+    // f2ull
+    case Intrinsic::nvvm_f2ull_rm:
+    case Intrinsic::nvvm_f2ull_rn:
+    case Intrinsic::nvvm_f2ull_rp:
+    case Intrinsic::nvvm_f2ull_rz:
+    case Intrinsic::nvvm_f2ull_rm_ftz:
+    case Intrinsic::nvvm_f2ull_rn_ftz:
+    case Intrinsic::nvvm_f2ull_rp_ftz:
+    case Intrinsic::nvvm_f2ull_rz_ftz:
+    // d2ll
+    case Intrinsic::nvvm_d2ll_rm:
+    case Intrinsic::nvvm_d2ll_rn:
+    case Intrinsic::nvvm_d2ll_rp:
+    case Intrinsic::nvvm_d2ll_rz:
+    // d2ull
+    case Intrinsic::nvvm_d2ull_rm:
+    case Intrinsic::nvvm_d2ull_rn:
+    case Intrinsic::nvvm_d2ull_rp:
+    case Intrinsic::nvvm_d2ull_rz: {
+      // In float-to-integer conversion, NaN inputs are converted to 0.
+      if (U.isNaN())
+        return ConstantInt::get(Ty, 0);
+
+      APFloat::roundingMode RMode = nvvm::IntrinsicGetRoundingMode(IntrinsicID);
+      bool IsFTZ = nvvm::IntrinsicShouldFTZ(IntrinsicID);
+      bool IsSigned = nvvm::IntrinsicConvertsToSignedInteger(IntrinsicID);
+
+      APSInt ResInt(Ty->getIntegerBitWidth(), !IsSigned);
+      auto FloatToRound = IsFTZ ? FTZPreserveSign(U) : U;
+
+      bool IsExact = false;
+      APFloat::opStatus Status =
+          FloatToRound.convertToInteger(ResInt, RMode, &IsExact);
+
+      if (Status != APFloat::opInvalidOp)
+        return ConstantInt::get(Ty, ResInt);
+      return nullptr;
+    }
+    }
+
     /// We only fold functions with finite arguments. Folding NaN and inf is
     /// likely to be aborted with an exception anyway, and some host libms
     /// have known errors raising exceptions.
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 515806428cbb..999386c0a049 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -4275,25 +4275,27 @@ Value *llvm::simplifyFCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS,
   return ::simplifyFCmpInst(Predicate, LHS, RHS, FMF, Q, RecursionLimit);
 }
 
-static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
-                                     const SimplifyQuery &Q,
-                                     bool AllowRefinement,
-                                     SmallVectorImpl<Instruction *> *DropFlags,
-                                     unsigned MaxRecurse) {
+static Value *simplifyWithOpsReplaced(Value *V,
+                                      ArrayRef<std::pair<Value *, Value *>> Ops,
+                                      const SimplifyQuery &Q,
+                                      bool AllowRefinement,
+                                      SmallVectorImpl<Instruction *> *DropFlags,
+                                      unsigned MaxRecurse) {
   assert((AllowRefinement || !Q.CanUseUndef) &&
          "If AllowRefinement=false then CanUseUndef=false");
+  for (const auto &OpAndRepOp : Ops) {
+    // We cannot replace a constant, and shouldn't even try.
+    if (isa<Constant>(OpAndRepOp.first))
+      return nullptr;
 
-  // Trivial replacement.
-  if (V == Op)
-    return RepOp;
+    // Trivial replacement.
+    if (V == OpAndRepOp.first)
+      return OpAndRepOp.second;
+  }
 
   if (!MaxRecurse--)
     return nullptr;
 
-  // We cannot replace a constant, and shouldn't even try.
-  if (isa<Constant>(Op))
-    return nullptr;
-
   auto *I = dyn_cast<Instruction>(V);
   if (!I)
     return nullptr;
@@ -4303,11 +4305,6 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
   if (isa<PHINode>(I))
     return nullptr;
 
-  // For vector types, the simplification must hold per-lane, so forbid
-  // potentially cross-lane operations like shufflevector.
-  if (Op->getType()->isVectorTy() && !isNotCrossLaneOperation(I))
-    return nullptr;
-
   // Don't fold away llvm.is.constant checks based on assumptions.
   if (match(I, m_Intrinsic<Intrinsic::is_constant>()))
     return nullptr;
@@ -4316,12 +4313,20 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
   if (isa<FreezeInst>(I))
     return nullptr;
 
+  for (const auto &OpAndRepOp : Ops) {
+    // For vector types, the simplification must hold per-lane, so forbid
+    // potentially cross-lane operations like shufflevector.
+    if (OpAndRepOp.first->getType()->isVectorTy() &&
+        !isNotCrossLaneOperation(I))
+      return nullptr;
+  }
+
   // Replace Op with RepOp in instruction operands.
   SmallVector<Value *, 8> NewOps;
   bool AnyReplaced = false;
   for (Value *InstOp : I->operands()) {
-    if (Value *NewInstOp = simplifyWithOpReplaced(
-            InstOp, Op, RepOp, Q, AllowRefinement, DropFlags, MaxRecurse)) {
+    if (Value *NewInstOp = simplifyWithOpsReplaced(
+            InstOp, Ops, Q, AllowRefinement, DropFlags, MaxRecurse)) {
       NewOps.push_back(NewInstOp);
       AnyReplaced = InstOp != NewInstOp;
     } else {
@@ -4372,7 +4377,8 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
       // by assumption and this case never wraps, so nowrap flags can be
       // ignored.
       if ((Opcode == Instruction::Sub || Opcode == Instruction::Xor) &&
-          NewOps[0] == RepOp && NewOps[1] == RepOp)
+          NewOps[0] == NewOps[1] &&
+          any_of(Ops, [=](const auto &Rep) { return NewOps[0] == Rep.second; }))
         return Constant::getNullValue(I->getType());
 
       // If we are substituting an absorber constant into a binop and extra
@@ -4382,10 +4388,10 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
       // (Op == 0) ? 0 : (Op & -Op)            --> Op & -Op
       // (Op == 0) ? 0 : (Op * (binop Op, C))  --> Op * (binop Op, C)
       // (Op == -1) ? -1 : (Op | (binop C, Op) --> Op | (binop C, Op)
-      Constant *Absorber =
-          ConstantExpr::getBinOpAbsorber(Opcode, I->getType());
+      Constant *Absorber = ConstantExpr::getBinOpAbsorber(Opcode, I->getType());
       if ((NewOps[0] == Absorber || NewOps[1] == Absorber) &&
-          impliesPoison(BO, Op))
+          any_of(Ops,
+                 [=](const auto &Rep) { return impliesPoison(BO, Rep.first); }))
         return Absorber;
     }
 
@@ -4453,6 +4459,15 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
                                   /*AllowNonDeterministic=*/false);
 }
 
+static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
+                                     const SimplifyQuery &Q,
+                                     bool AllowRefinement,
+                                     SmallVectorImpl<Instruction *> *DropFlags,
+                                     unsigned MaxRecurse) {
+  return simplifyWithOpsReplaced(V, {{Op, RepOp}}, Q, AllowRefinement,
+                                 DropFlags, MaxRecurse);
+}
+
 Value *llvm::simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
                                     const SimplifyQuery &Q,
                                     bool AllowRefinement,
@@ -4595,21 +4610,20 @@ static Value *simplifySelectWithFakeICmpEq(Value *CmpLHS, Value *CmpRHS,
 
 /// Try to simplify a select instruction when its condition operand is an
 /// integer equality or floating-point equivalence comparison.
-static Value *simplifySelectWithEquivalence(Value *CmpLHS, Value *CmpRHS,
-                                            Value *TrueVal, Value *FalseVal,
-                                            const SimplifyQuery &Q,
-                                            unsigned MaxRecurse) {
+static Value *simplifySelectWithEquivalence(
+    ArrayRef<std::pair<Value *, Value *>> Replacements, Value *TrueVal,
+    Value *FalseVal, const SimplifyQuery &Q, unsigned MaxRecurse) {
   Value *SimplifiedFalseVal =
-      simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q.getWithoutUndef(),
-                             /* AllowRefinement */ false,
-                             /* DropFlags */ nullptr, MaxRecurse);
+      simplifyWithOpsReplaced(FalseVal, Replacements, Q.getWithoutUndef(),
+                              /* AllowRefinement */ false,
+                              /* DropFlags */ nullptr, MaxRecurse);
   if (!SimplifiedFalseVal)
     SimplifiedFalseVal = FalseVal;
 
   Value *SimplifiedTrueVal =
-      simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q,
-                             /* AllowRefinement */ true,
-                             /* DropFlags */ nullptr, MaxRecurse);
+      simplifyWithOpsReplaced(TrueVal, Replacements, Q,
+                              /* AllowRefinement */ true,
+                              /* DropFlags */ nullptr, MaxRecurse);
   if (!SimplifiedTrueVal)
     SimplifiedTrueVal = TrueVal;
 
@@ -4707,10 +4721,10 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal,
   // the arms of the select. See if substituting this value into the arm and
   // simplifying the result yields the same value as the other arm.
   if (Pred == ICmpInst::ICMP_EQ) {
-    if (Value *V = simplifySelectWithEquivalence(CmpLHS, CmpRHS, TrueVal,
+    if (Value *V = simplifySelectWithEquivalence({{CmpLHS, CmpRHS}}, TrueVal,
                                                  FalseVal, Q, MaxRecurse))
       return V;
-    if (Value *V = simplifySelectWithEquivalence(CmpRHS, CmpLHS, TrueVal,
+    if (Value *V = simplifySelectWithEquivalence({{CmpRHS, CmpLHS}}, TrueVal,
                                                  FalseVal, Q, MaxRecurse))
       return V;
 
@@ -4720,11 +4734,8 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal,
     if (match(CmpLHS, m_Or(m_Value(X), m_Value(Y))) &&
         match(CmpRHS, m_Zero())) {
       // (X | Y) == 0 implies X == 0 and Y == 0.
-      if (Value *V = simplifySelectWithEquivalence(X, CmpRHS, TrueVal, FalseVal,
-                                                   Q, MaxRecurse))
-        return V;
-      if (Value *V = simplifySelectWithEquivalence(Y, CmpRHS, TrueVal, FalseVal,
-                                                   Q, MaxRecurse))
+      if (Value *V = simplifySelectWithEquivalence(
+              {{X, CmpRHS}, {Y, CmpRHS}}, TrueVal, FalseVal, Q, MaxRecurse))
         return V;
     }
 
@@ -4732,11 +4743,8 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal,
     if (match(CmpLHS, m_And(m_Value(X), m_Value(Y))) &&
         match(CmpRHS, m_AllOnes())) {
       // (X & Y) == -1 implies X == -1 and Y == -1.
-      if (Value *V = simplifySelectWithEquivalence(X, CmpRHS, TrueVal, FalseVal,
-                                                   Q, MaxRecurse))
-        return V;
-      if (Value *V = simplifySelectWithEquivalence(Y, CmpRHS, TrueVal, FalseVal,
-                                                   Q, MaxRecurse))
+      if (Value *V = simplifySelectWithEquivalence(
+              {{X, CmpRHS}, {Y, CmpRHS}}, TrueVal, FalseVal, Q, MaxRecurse))
         return V;
     }
   }
@@ -4765,11 +4773,11 @@ static Value *simplifySelectWithFCmp(Value *Cond, Value *T, Value *F,
   // This transforms is safe if at least one operand is known to not be zero.
   // Otherwise, the select can change the sign of a zero operand.
   if (IsEquiv) {
-    if (Value *V =
-            simplifySelectWithEquivalence(CmpLHS, CmpRHS, T, F, Q, MaxRecurse))
+    if (Value *V = simplifySelectWithEquivalence({{CmpLHS, CmpRHS}}, T, F, Q,
+                                                 MaxRecurse))
       return V;
-    if (Value *V =
-            simplifySelectWithEquivalence(CmpRHS, CmpLHS, T, F, Q, MaxRecurse))
+    if (Value *V = simplifySelectWithEquivalence({{CmpRHS, CmpLHS}}, T, F, Q,
+                                                 MaxRecurse))
       return V;
   }
 
diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp
index 4689451243cd..e9d96a0c2972 100644
--- a/llvm/lib/Analysis/Lint.cpp
+++ b/llvm/lib/Analysis/Lint.cpp
@@ -266,6 +266,30 @@ void Lint::visitCallBase(CallBase &I) {
           visitMemoryReference(I, Loc, DL->getABITypeAlign(Ty), Ty,
                                MemRef::Read | MemRef::Write);
         }
+
+        // Check that ABI attributes for the function and call-site match.
+        unsigned ArgNo = AI->getOperandNo();
+        Attribute::AttrKind ABIAttributes[] = {
+            Attribute::ZExt,         Attribute::SExt,     Attribute::InReg,
+            Attribute::ByVal,        Attribute::ByRef,    Attribute::InAlloca,
+            Attribute::Preallocated, Attribute::StructRet};
+        AttributeList CallAttrs = I.getAttributes();
+        for (Attribute::AttrKind Attr : ABIAttributes) {
+          Attribute CallAttr = CallAttrs.getParamAttr(ArgNo, Attr);
+          Attribute FnAttr = F->getParamAttribute(ArgNo, Attr);
+          Check(CallAttr.isValid() == FnAttr.isValid(),
+                Twine("Undefined behavior: ABI attribute ") +
+                    Attribute::getNameFromAttrKind(Attr) +
+                    " not present on both function and call-site",
+                &I);
+          if (CallAttr.isValid() && FnAttr.isValid()) {
+            Check(CallAttr == FnAttr,
+                  Twine("Undefined behavior: ABI attribute ") +
+                      Attribute::getNameFromAttrKind(Attr) +
+                      " does not have same argument for function and call-site",
+                  &I);
+          }
+        }
       }
     }
   }
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 54b9521fda8f..bc03e4052a70 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -25,10 +25,9 @@
 
 using namespace llvm;
 
-static bool isAligned(const Value *Base, const APInt &Offset, Align Alignment,
+static bool isAligned(const Value *Base, Align Alignment,
                       const DataLayout &DL) {
-  Align BA = Base->getPointerAlignment(DL);
-  return BA >= Alignment && Offset.isAligned(BA);
+  return Base->getPointerAlignment(DL) >= Alignment;
 }
 
 /// Test if V is always a pointer to allocated and suitably aligned memory for
@@ -118,8 +117,7 @@ static bool isDereferenceableAndAlignedPointer(
     // As we recursed through GEPs to get here, we've incrementally checked
     // that each step advanced by a multiple of the alignment. If our base is
     // properly aligned, then the original offset accessed must also be.
-    APInt Offset(DL.getTypeStoreSizeInBits(V->getType()), 0);
-    return isAligned(V, Offset, Alignment, DL);
+    return isAligned(V, Alignment, DL);
   }
 
   /// TODO refactor this function to be able to search independently for
@@ -154,8 +152,7 @@ static bool isDereferenceableAndAlignedPointer(
         // checked that each step advanced by a multiple of the alignment. If
         // our base is properly aligned, then the original offset accessed
         // must also be.
-        APInt Offset(DL.getTypeStoreSizeInBits(V->getType()), 0);
-        return isAligned(V, Offset, Alignment, DL);
+        return isAligned(V, Alignment, DL);
       }
     }
   }
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index b5668a14a4a2..7e18f7c9c1ac 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -226,7 +226,7 @@ static cl::opt<unsigned> RangeIterThreshold(
 
 static cl::opt<unsigned> MaxLoopGuardCollectionDepth(
     "scalar-evolution-max-loop-guard-collection-depth", cl::Hidden,
-    cl::desc("Maximum depth for recrusive loop guard collection"), cl::init(1));
+    cl::desc("Maximum depth for recursive loop guard collection"), cl::init(1));
 
 static cl::opt<bool>
 ClassifyExpressions("scalar-evolution-classify-expressions",
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 2f6e869ae7b7..0eb43dd581ac 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -8641,6 +8641,80 @@ SelectPatternResult llvm::getSelectPattern(CmpInst::Predicate Pred,
   }
 }
 
+std::optional<std::pair<CmpPredicate, Constant *>>
+llvm::getFlippedStrictnessPredicateAndConstant(CmpPredicate Pred, Constant *C) {
+  assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) &&
+         "Only for relational integer predicates.");
+  if (isa<UndefValue>(C))
+    return std::nullopt;
+
+  Type *Type = C->getType();
+  bool IsSigned = ICmpInst::isSigned(Pred);
+
+  CmpInst::Predicate UnsignedPred = ICmpInst::getUnsignedPredicate(Pred);
+  bool WillIncrement =
+      UnsignedPred == ICmpInst::ICMP_ULE || UnsignedPred == ICmpInst::ICMP_UGT;
+
+  // Check if the constant operand can be safely incremented/decremented
+  // without overflowing/underflowing.
+  auto ConstantIsOk = [WillIncrement, IsSigned](ConstantInt *C) {
+    return WillIncrement ? !C->isMaxValue(IsSigned) : !C->isMinValue(IsSigned);
+  };
+
+  Constant *SafeReplacementConstant = nullptr;
+  if (auto *CI = dyn_cast<ConstantInt>(C)) {
+    // Bail out if the constant can't be safely incremented/decremented.
+    if (!ConstantIsOk(CI))
+      return std::nullopt;
+  } else if (auto *FVTy = dyn_cast<FixedVectorType>(Type)) {
+    unsigned NumElts = FVTy->getNumElements();
+    for (unsigned i = 0; i != NumElts; ++i) {
+      Constant *Elt = C->getAggregateElement(i);
+      if (!Elt)
+        return std::nullopt;
+
+      if (isa<UndefValue>(Elt))
+        continue;
+
+      // Bail out if we can't determine if this constant is min/max or if we
+      // know that this constant is min/max.
+      auto *CI = dyn_cast<ConstantInt>(Elt);
+      if (!CI || !ConstantIsOk(CI))
+        return std::nullopt;
+
+      if (!SafeReplacementConstant)
+        SafeReplacementConstant = CI;
+    }
+  } else if (isa<VectorType>(C->getType())) {
+    // Handle scalable splat
+    Value *SplatC = C->getSplatValue();
+    auto *CI = dyn_cast_or_null<ConstantInt>(SplatC);
+    // Bail out if the constant can't be safely incremented/decremented.
+    if (!CI || !ConstantIsOk(CI))
+      return std::nullopt;
+  } else {
+    // ConstantExpr?
+    return std::nullopt;
+  }
+
+  // It may not be safe to change a compare predicate in the presence of
+  // undefined elements, so replace those elements with the first safe constant
+  // that we found.
+  // TODO: in case of poison, it is safe; let's replace undefs only.
+  if (C->containsUndefOrPoisonElement()) {
+    assert(SafeReplacementConstant && "Replacement constant not set");
+    C = Constant::replaceUndefsWith(C, SafeReplacementConstant);
+  }
+
+  CmpInst::Predicate NewPred = CmpInst::getFlippedStrictnessPredicate(Pred);
+
+  // Increment or decrement the constant.
+  Constant *OneOrNegOne = ConstantInt::get(Type, WillIncrement ? 1 : -1, true);
+  Constant *NewC = ConstantExpr::getAdd(C, OneOrNegOne);
+
+  return std::make_pair(NewPred, NewC);
+}
+
 static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred,
                                               FastMathFlags FMF,
                                               Value *CmpLHS, Value *CmpRHS,
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 7bd3fb33b47d..3ba45900e456 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -3914,21 +3914,22 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV,
   if (isa<ConstantAggregateZero>(CV)) {
     StructType *structType;
     if (AliasList && (structType = llvm::dyn_cast<StructType>(CV->getType()))) {
-      // Handle cases of aliases to direct struct elements
-      const StructLayout *Layout = DL.getStructLayout(structType);
-      uint64_t SizeSoFar = 0;
-      for (unsigned int i = 0, n = structType->getNumElements(); i < n - 1;
-           ++i) {
-        uint64_t GapToNext = Layout->getElementOffset(i + 1) - SizeSoFar;
-        AP.OutStreamer->emitZeros(GapToNext);
-        SizeSoFar += GapToNext;
-        emitGlobalAliasInline(AP, Offset + SizeSoFar, AliasList);
+      unsigned numElements = {structType->getNumElements()};
+      if (numElements != 0) {
+        // Handle cases of aliases to direct struct elements
+        const StructLayout *Layout = DL.getStructLayout(structType);
+        uint64_t SizeSoFar = 0;
+        for (unsigned int i = 0; i < numElements - 1; ++i) {
+          uint64_t GapToNext = Layout->getElementOffset(i + 1) - SizeSoFar;
+          AP.OutStreamer->emitZeros(GapToNext);
+          SizeSoFar += GapToNext;
+          emitGlobalAliasInline(AP, Offset + SizeSoFar, AliasList);
+        }
+        AP.OutStreamer->emitZeros(Size - SizeSoFar);
+        return;
       }
-      AP.OutStreamer->emitZeros(Size - SizeSoFar);
-      return;
-    } else {
-      return AP.OutStreamer->emitZeros(Size);
     }
+    return AP.OutStreamer->emitZeros(Size);
   }
 
   if (isa<UndefValue>(CV))
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index e1291e2a14a6..11de4b61797b 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -3789,6 +3789,7 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
       // they depend on addresses, throwing them out and rebuilding them.
       setCurrentDWARF5AccelTable(DWARF5AccelTableKind::CU);
       CU.constructTypeDIE(RefDie, cast<DICompositeType>(CTy));
+      CU.updateAcceleratorTables(CTy->getScope(), CTy, RefDie);
       return;
     }
 
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 02256546b6b8..163205378fb4 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -315,6 +315,11 @@ public:
   /// Get context owner's DIE.
   DIE *createTypeDIE(const DICompositeType *Ty);
 
+  /// If this is a named finished type then include it in the list of types for
+  /// the accelerator tables.
+  void updateAcceleratorTables(const DIScope *Context, const DIType *Ty,
+                               const DIE &TyDIE);
+
 protected:
   ~DwarfUnit();
 
@@ -357,11 +362,6 @@ private:
 
   virtual void finishNonUnitTypeDIE(DIE& D, const DICompositeType *CTy) = 0;
 
-  /// If this is a named finished type then include it in the list of types for
-  /// the accelerator tables.
-  void updateAcceleratorTables(const DIScope *Context, const DIType *Ty,
-                               const DIE &TyDIE);
-
   virtual bool isDwoUnit() const = 0;
   const MCSymbol *getCrossSectionRelativeBaseAddress() const override;
 
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 5c712e4f007d..ba1b10ec8b9b 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -152,7 +152,7 @@ static cl::opt<bool>
 
 static cl::opt<bool>
     EnableAndCmpSinking("enable-andcmp-sinking", cl::Hidden, cl::init(true),
-                        cl::desc("Enable sinkinig and/cmp into branches."));
+                        cl::desc("Enable sinking and/cmp into branches."));
 
 static cl::opt<bool> DisableStoreExtract(
     "disable-cgp-store-extract", cl::Hidden, cl::init(false),
diff --git a/llvm/lib/CodeGen/MIRSampleProfile.cpp b/llvm/lib/CodeGen/MIRSampleProfile.cpp
index 23db09b89599..9bba50e8e692 100644
--- a/llvm/lib/CodeGen/MIRSampleProfile.cpp
+++ b/llvm/lib/CodeGen/MIRSampleProfile.cpp
@@ -46,8 +46,9 @@ static cl::opt<bool> ShowFSBranchProb(
     cl::desc("Print setting flow sensitive branch probabilities"));
 static cl::opt<unsigned> FSProfileDebugProbDiffThreshold(
     "fs-profile-debug-prob-diff-threshold", cl::init(10),
-    cl::desc("Only show debug message if the branch probility is greater than "
-             "this value (in percentage)."));
+    cl::desc(
+        "Only show debug message if the branch probability is greater than "
+        "this value (in percentage)."));
 
 static cl::opt<unsigned> FSProfileDebugBWThreshold(
     "fs-profile-debug-bw-threshold", cl::init(10000),
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 0f68313e64f5..05bc4cf646f4 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -149,7 +149,7 @@ static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
 static cl::opt<bool>
     TailDupPlacement("tail-dup-placement",
                      cl::desc("Perform tail duplication during placement. "
-                              "Creates more fallthrough opportunites in "
+                              "Creates more fallthrough opportunities in "
                               "outline branches."),
                      cl::init(true), cl::Hidden);
 
diff --git a/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
index 56ffffff6224..2e92dd8f257b 100644
--- a/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
+++ b/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
@@ -29,7 +29,7 @@ namespace llvm {
 cl::opt<unsigned>
     StaticLikelyProb("static-likely-prob",
                      cl::desc("branch probability threshold in percentage"
-                              "to be considered very likely"),
+                              " to be considered very likely"),
                      cl::init(80), cl::Hidden);
 
 cl::opt<unsigned> ProfileLikelyProb(
diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
index 79b0fa672cc6..3ab6315f9c8e 100644
--- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
+++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
@@ -30,22 +30,22 @@ static bool isValidRegUse(const MachineOperand &MO) {
   return isValidReg(MO) && MO.isUse();
 }
 
-static bool isValidRegUseOf(const MachineOperand &MO, MCRegister PhysReg,
+static bool isValidRegUseOf(const MachineOperand &MO, MCRegister Reg,
                             const TargetRegisterInfo *TRI) {
   if (!isValidRegUse(MO))
     return false;
-  return TRI->regsOverlap(MO.getReg(), PhysReg);
+  return TRI->regsOverlap(MO.getReg(), Reg);
 }
 
 static bool isValidRegDef(const MachineOperand &MO) {
   return isValidReg(MO) && MO.isDef();
 }
 
-static bool isValidRegDefOf(const MachineOperand &MO, MCRegister PhysReg,
+static bool isValidRegDefOf(const MachineOperand &MO, MCRegister Reg,
                             const TargetRegisterInfo *TRI) {
   if (!isValidRegDef(MO))
     return false;
-  return TRI->regsOverlap(MO.getReg(), PhysReg);
+  return TRI->regsOverlap(MO.getReg(), Reg);
 }
 
 void ReachingDefAnalysis::enterBasicBlock(MachineBasicBlock *MBB) {
@@ -261,7 +261,7 @@ void ReachingDefAnalysis::traverse() {
 }
 
 int ReachingDefAnalysis::getReachingDef(MachineInstr *MI,
-                                        MCRegister PhysReg) const {
+                                        MCRegister Reg) const {
   assert(InstIds.count(MI) && "Unexpected machine instuction.");
   int InstId = InstIds.lookup(MI);
   int DefRes = ReachingDefDefaultVal;
@@ -269,7 +269,7 @@ int ReachingDefAnalysis::getReachingDef(MachineInstr *MI,
   assert(MBBNumber < MBBReachingDefs.numBlockIDs() &&
          "Unexpected basic block number.");
   int LatestDef = ReachingDefDefaultVal;
-  for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+  for (MCRegUnit Unit : TRI->regunits(Reg)) {
     for (int Def : MBBReachingDefs.defs(MBBNumber, Unit)) {
       if (Def >= InstId)
         break;
@@ -280,22 +280,21 @@ int ReachingDefAnalysis::getReachingDef(MachineInstr *MI,
   return LatestDef;
 }
 
-MachineInstr *
-ReachingDefAnalysis::getReachingLocalMIDef(MachineInstr *MI,
-                                           MCRegister PhysReg) const {
-  return hasLocalDefBefore(MI, PhysReg)
-    ? getInstFromId(MI->getParent(), getReachingDef(MI, PhysReg))
-    : nullptr;
+MachineInstr *ReachingDefAnalysis::getReachingLocalMIDef(MachineInstr *MI,
+                                                         MCRegister Reg) const {
+  return hasLocalDefBefore(MI, Reg)
+             ? getInstFromId(MI->getParent(), getReachingDef(MI, Reg))
+             : nullptr;
 }
 
 bool ReachingDefAnalysis::hasSameReachingDef(MachineInstr *A, MachineInstr *B,
-                                             MCRegister PhysReg) const {
+                                             MCRegister Reg) const {
   MachineBasicBlock *ParentA = A->getParent();
   MachineBasicBlock *ParentB = B->getParent();
   if (ParentA != ParentB)
     return false;
 
-  return getReachingDef(A, PhysReg) == getReachingDef(B, PhysReg);
+  return getReachingDef(A, Reg) == getReachingDef(B, Reg);
 }
 
 MachineInstr *ReachingDefAnalysis::getInstFromId(MachineBasicBlock *MBB,
@@ -318,19 +317,18 @@ MachineInstr *ReachingDefAnalysis::getInstFromId(MachineBasicBlock *MBB,
   return nullptr;
 }
 
-int ReachingDefAnalysis::getClearance(MachineInstr *MI,
-                                      MCRegister PhysReg) const {
+int ReachingDefAnalysis::getClearance(MachineInstr *MI, MCRegister Reg) const {
   assert(InstIds.count(MI) && "Unexpected machine instuction.");
-  return InstIds.lookup(MI) - getReachingDef(MI, PhysReg);
+  return InstIds.lookup(MI) - getReachingDef(MI, Reg);
 }
 
 bool ReachingDefAnalysis::hasLocalDefBefore(MachineInstr *MI,
-                                            MCRegister PhysReg) const {
-  return getReachingDef(MI, PhysReg) >= 0;
+                                            MCRegister Reg) const {
+  return getReachingDef(MI, Reg) >= 0;
 }
 
 void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def,
-                                               MCRegister PhysReg,
+                                               MCRegister Reg,
                                                InstSet &Uses) const {
   MachineBasicBlock *MBB = Def->getParent();
   MachineBasicBlock::iterator MI = MachineBasicBlock::iterator(Def);
@@ -340,11 +338,11 @@ void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def,
 
     // If/when we find a new reaching def, we know that there's no more uses
     // of 'Def'.
-    if (getReachingLocalMIDef(&*MI, PhysReg) != Def)
+    if (getReachingLocalMIDef(&*MI, Reg) != Def)
       return;
 
     for (auto &MO : MI->operands()) {
-      if (!isValidRegUseOf(MO, PhysReg, TRI))
+      if (!isValidRegUseOf(MO, Reg, TRI))
         continue;
 
       Uses.insert(&*MI);
@@ -354,15 +352,14 @@ void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def,
   }
 }
 
-bool ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB,
-                                        MCRegister PhysReg,
+bool ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB, MCRegister Reg,
                                         InstSet &Uses) const {
   for (MachineInstr &MI :
        instructionsWithoutDebug(MBB->instr_begin(), MBB->instr_end())) {
     for (auto &MO : MI.operands()) {
-      if (!isValidRegUseOf(MO, PhysReg, TRI))
+      if (!isValidRegUseOf(MO, Reg, TRI))
         continue;
-      if (getReachingDef(&MI, PhysReg) >= 0)
+      if (getReachingDef(&MI, Reg) >= 0)
         return false;
       Uses.insert(&MI);
     }
@@ -370,18 +367,18 @@ bool ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB,
   auto Last = MBB->getLastNonDebugInstr();
   if (Last == MBB->end())
     return true;
-  return isReachingDefLiveOut(&*Last, PhysReg);
+  return isReachingDefLiveOut(&*Last, Reg);
 }
 
-void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, MCRegister PhysReg,
+void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, MCRegister Reg,
                                         InstSet &Uses) const {
   MachineBasicBlock *MBB = MI->getParent();
 
   // Collect the uses that each def touches within the block.
-  getReachingLocalUses(MI, PhysReg, Uses);
+  getReachingLocalUses(MI, Reg, Uses);
 
   // Handle live-out values.
-  if (auto *LiveOut = getLocalLiveOutMIDef(MI->getParent(), PhysReg)) {
+  if (auto *LiveOut = getLocalLiveOutMIDef(MI->getParent(), Reg)) {
     if (LiveOut != MI)
       return;
 
@@ -389,9 +386,9 @@ void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, MCRegister PhysReg,
     SmallPtrSet<MachineBasicBlock*, 4>Visited;
     while (!ToVisit.empty()) {
       MachineBasicBlock *MBB = ToVisit.pop_back_val();
-      if (Visited.count(MBB) || !MBB->isLiveIn(PhysReg))
+      if (Visited.count(MBB) || !MBB->isLiveIn(Reg))
         continue;
-      if (getLiveInUses(MBB, PhysReg, Uses))
+      if (getLiveInUses(MBB, Reg, Uses))
         llvm::append_range(ToVisit, MBB->successors());
       Visited.insert(MBB);
     }
@@ -399,25 +396,25 @@ void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, MCRegister PhysReg,
 }
 
 void ReachingDefAnalysis::getGlobalReachingDefs(MachineInstr *MI,
-                                                MCRegister PhysReg,
+                                                MCRegister Reg,
                                                 InstSet &Defs) const {
-  if (auto *Def = getUniqueReachingMIDef(MI, PhysReg)) {
+  if (auto *Def = getUniqueReachingMIDef(MI, Reg)) {
     Defs.insert(Def);
     return;
   }
 
   for (auto *MBB : MI->getParent()->predecessors())
-    getLiveOuts(MBB, PhysReg, Defs);
+    getLiveOuts(MBB, Reg, Defs);
 }
 
-void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB,
-                                      MCRegister PhysReg, InstSet &Defs) const {
+void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, MCRegister Reg,
+                                      InstSet &Defs) const {
   SmallPtrSet<MachineBasicBlock*, 2> VisitedBBs;
-  getLiveOuts(MBB, PhysReg, Defs, VisitedBBs);
+  getLiveOuts(MBB, Reg, Defs, VisitedBBs);
 }
 
-void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB,
-                                      MCRegister PhysReg, InstSet &Defs,
+void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, MCRegister Reg,
+                                      InstSet &Defs,
                                       BlockSet &VisitedBBs) const {
   if (VisitedBBs.count(MBB))
     return;
@@ -425,28 +422,28 @@ void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB,
   VisitedBBs.insert(MBB);
   LiveRegUnits LiveRegs(*TRI);
   LiveRegs.addLiveOuts(*MBB);
-  if (LiveRegs.available(PhysReg))
+  if (LiveRegs.available(Reg))
     return;
 
-  if (auto *Def = getLocalLiveOutMIDef(MBB, PhysReg))
+  if (auto *Def = getLocalLiveOutMIDef(MBB, Reg))
     Defs.insert(Def);
   else
     for (auto *Pred : MBB->predecessors())
-      getLiveOuts(Pred, PhysReg, Defs, VisitedBBs);
+      getLiveOuts(Pred, Reg, Defs, VisitedBBs);
 }
 
 MachineInstr *
 ReachingDefAnalysis::getUniqueReachingMIDef(MachineInstr *MI,
-                                            MCRegister PhysReg) const {
+                                            MCRegister Reg) const {
   // If there's a local def before MI, return it.
-  MachineInstr *LocalDef = getReachingLocalMIDef(MI, PhysReg);
+  MachineInstr *LocalDef = getReachingLocalMIDef(MI, Reg);
   if (LocalDef && InstIds.lookup(LocalDef) < InstIds.lookup(MI))
     return LocalDef;
 
   SmallPtrSet<MachineInstr*, 2> Incoming;
   MachineBasicBlock *Parent = MI->getParent();
   for (auto *Pred : Parent->predecessors())
-    getLiveOuts(Pred, PhysReg, Incoming);
+    getLiveOuts(Pred, Reg, Incoming);
 
   // Check that we have a single incoming value and that it does not
   // come from the same block as MI - since it would mean that the def
@@ -469,13 +466,13 @@ MachineInstr *ReachingDefAnalysis::getMIOperand(MachineInstr *MI,
 }
 
 bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI,
-                                         MCRegister PhysReg) const {
+                                         MCRegister Reg) const {
   MachineBasicBlock *MBB = MI->getParent();
   LiveRegUnits LiveRegs(*TRI);
   LiveRegs.addLiveOuts(*MBB);
 
   // Yes if the register is live out of the basic block.
-  if (!LiveRegs.available(PhysReg))
+  if (!LiveRegs.available(Reg))
     return true;
 
   // Walk backwards through the block to see if the register is live at some
@@ -483,62 +480,61 @@ bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI,
   for (MachineInstr &Last :
        instructionsWithoutDebug(MBB->instr_rbegin(), MBB->instr_rend())) {
     LiveRegs.stepBackward(Last);
-    if (!LiveRegs.available(PhysReg))
+    if (!LiveRegs.available(Reg))
       return InstIds.lookup(&Last) > InstIds.lookup(MI);
   }
   return false;
 }
 
 bool ReachingDefAnalysis::isRegDefinedAfter(MachineInstr *MI,
-                                            MCRegister PhysReg) const {
+                                            MCRegister Reg) const {
   MachineBasicBlock *MBB = MI->getParent();
   auto Last = MBB->getLastNonDebugInstr();
   if (Last != MBB->end() &&
-      getReachingDef(MI, PhysReg) != getReachingDef(&*Last, PhysReg))
+      getReachingDef(MI, Reg) != getReachingDef(&*Last, Reg))
     return true;
 
-  if (auto *Def = getLocalLiveOutMIDef(MBB, PhysReg))
-    return Def == getReachingLocalMIDef(MI, PhysReg);
+  if (auto *Def = getLocalLiveOutMIDef(MBB, Reg))
+    return Def == getReachingLocalMIDef(MI, Reg);
 
   return false;
 }
 
 bool ReachingDefAnalysis::isReachingDefLiveOut(MachineInstr *MI,
-                                               MCRegister PhysReg) const {
+                                               MCRegister Reg) const {
   MachineBasicBlock *MBB = MI->getParent();
   LiveRegUnits LiveRegs(*TRI);
   LiveRegs.addLiveOuts(*MBB);
-  if (LiveRegs.available(PhysReg))
+  if (LiveRegs.available(Reg))
     return false;
 
   auto Last = MBB->getLastNonDebugInstr();
-  int Def = getReachingDef(MI, PhysReg);
-  if (Last != MBB->end() && getReachingDef(&*Last, PhysReg) != Def)
+  int Def = getReachingDef(MI, Reg);
+  if (Last != MBB->end() && getReachingDef(&*Last, Reg) != Def)
     return false;
 
   // Finally check that the last instruction doesn't redefine the register.
   for (auto &MO : Last->operands())
-    if (isValidRegDefOf(MO, PhysReg, TRI))
+    if (isValidRegDefOf(MO, Reg, TRI))
       return false;
 
   return true;
 }
 
-MachineInstr *
-ReachingDefAnalysis::getLocalLiveOutMIDef(MachineBasicBlock *MBB,
-                                          MCRegister PhysReg) const {
+MachineInstr *ReachingDefAnalysis::getLocalLiveOutMIDef(MachineBasicBlock *MBB,
+                                                        MCRegister Reg) const {
   LiveRegUnits LiveRegs(*TRI);
   LiveRegs.addLiveOuts(*MBB);
-  if (LiveRegs.available(PhysReg))
+  if (LiveRegs.available(Reg))
     return nullptr;
 
   auto Last = MBB->getLastNonDebugInstr();
   if (Last == MBB->end())
     return nullptr;
 
-  int Def = getReachingDef(&*Last, PhysReg);
+  int Def = getReachingDef(&*Last, Reg);
   for (auto &MO : Last->operands())
-    if (isValidRegDefOf(MO, PhysReg, TRI))
+    if (isValidRegDefOf(MO, Reg, TRI))
       return &*Last;
 
   return Def < 0 ? nullptr : getInstFromId(MBB, Def);
@@ -650,7 +646,7 @@ ReachingDefAnalysis::isSafeToRemove(MachineInstr *MI, InstSet &Visited,
 void ReachingDefAnalysis::collectKilledOperands(MachineInstr *MI,
                                                 InstSet &Dead) const {
   Dead.insert(MI);
-  auto IsDead = [this, &Dead](MachineInstr *Def, MCRegister PhysReg) {
+  auto IsDead = [this, &Dead](MachineInstr *Def, MCRegister Reg) {
     if (mayHaveSideEffects(*Def))
       return false;
 
@@ -666,7 +662,7 @@ void ReachingDefAnalysis::collectKilledOperands(MachineInstr *MI,
       return false;
 
     SmallPtrSet<MachineInstr*, 4> Uses;
-    getGlobalUses(Def, PhysReg, Uses);
+    getGlobalUses(Def, Reg, Uses);
     return llvm::set_is_subset(Uses, Dead);
   };
 
@@ -680,18 +676,18 @@ void ReachingDefAnalysis::collectKilledOperands(MachineInstr *MI,
 }
 
 bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI,
-                                           MCRegister PhysReg) const {
+                                           MCRegister Reg) const {
   SmallPtrSet<MachineInstr*, 1> Ignore;
-  return isSafeToDefRegAt(MI, PhysReg, Ignore);
+  return isSafeToDefRegAt(MI, Reg, Ignore);
 }
 
-bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI, MCRegister PhysReg,
+bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI, MCRegister Reg,
                                            InstSet &Ignore) const {
   // Check for any uses of the register after MI.
-  if (isRegUsedAfter(MI, PhysReg)) {
-    if (auto *Def = getReachingLocalMIDef(MI, PhysReg)) {
+  if (isRegUsedAfter(MI, Reg)) {
+    if (auto *Def = getReachingLocalMIDef(MI, Reg)) {
       SmallPtrSet<MachineInstr*, 2> Uses;
-      getGlobalUses(Def, PhysReg, Uses);
+      getGlobalUses(Def, Reg, Uses);
       if (!llvm::set_is_subset(Uses, Ignore))
         return false;
     } else
@@ -700,13 +696,13 @@ bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI, MCRegister PhysReg,
 
   MachineBasicBlock *MBB = MI->getParent();
   // Check for any defs after MI.
-  if (isRegDefinedAfter(MI, PhysReg)) {
+  if (isRegDefinedAfter(MI, Reg)) {
     auto I = MachineBasicBlock::iterator(MI);
     for (auto E = MBB->end(); I != E; ++I) {
       if (Ignore.count(&*I))
         continue;
       for (auto &MO : I->operands())
-        if (isValidRegDefOf(MO, PhysReg, TRI))
+        if (isValidRegDefOf(MO, Reg, TRI))
           return false;
     }
   }
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index e61dad5cf64d..b94992c20b11 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -140,7 +140,7 @@ static cl::opt<bool> GreedyReverseLocalAssignment(
 static cl::opt<unsigned> SplitThresholdForRegWithHint(
     "split-threshold-for-reg-with-hint",
     cl::desc("The threshold for splitting a virtual register with a hint, in "
-             "percentate"),
+             "percentage"),
     cl::init(75), cl::Hidden);
 
 static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator",
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 20ad6445344d..8313927dd2aa 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -113,7 +113,7 @@ static cl::opt<unsigned> LargeIntervalSizeThreshold(
 
 static cl::opt<unsigned> LargeIntervalFreqThreshold(
     "large-interval-freq-threshold", cl::Hidden,
-    cl::desc("For a large interval, if it is coalesed with other live "
+    cl::desc("For a large interval, if it is coalesced with other live "
              "intervals many times more than the threshold, stop its "
              "coalescing to control the compile time. "),
     cl::init(256));
@@ -1325,11 +1325,6 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
   const MCInstrDesc &MCID = DefMI->getDesc();
   if (MCID.getNumDefs() != 1)
     return false;
-  // Only support subregister destinations when the def is read-undef.
-  MachineOperand &DstOperand = CopyMI->getOperand(0);
-  Register CopyDstReg = DstOperand.getReg();
-  if (DstOperand.getSubReg() && !DstOperand.isUndef())
-    return false;
 
   // If both SrcIdx and DstIdx are set, correct rematerialization would widen
   // the register substantially (beyond both source and dest size). This is bad
@@ -1339,6 +1334,32 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
   if (SrcIdx && DstIdx)
     return false;
 
+  // Only support subregister destinations when the def is read-undef.
+  MachineOperand &DstOperand = CopyMI->getOperand(0);
+  Register CopyDstReg = DstOperand.getReg();
+  if (DstOperand.getSubReg() && !DstOperand.isUndef())
+    return false;
+
+  // In the physical register case, checking that the def is read-undef is not
+  // enough. We're widening the def and need to avoid clobbering other live
+  // values in the unused register pieces.
+  //
+  // TODO: Targets may support rewriting the rematerialized instruction to only
+  // touch relevant lanes, in which case we don't need any liveness check.
+  if (CopyDstReg.isPhysical() && CP.isPartial()) {
+    for (MCRegUnit Unit : TRI->regunits(DstReg)) {
+      // Ignore the register units we are writing anyway.
+      if (is_contained(TRI->regunits(CopyDstReg), Unit))
+        continue;
+
+      // Check if the other lanes we are defining are live at the
+      // rematerialization point.
+      LiveRange &LR = LIS->getRegUnit(Unit);
+      if (LR.liveAt(CopyIdx))
+        return false;
+    }
+  }
+
   const unsigned DefSubIdx = DefMI->getOperand(0).getSubReg();
   const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI, *MF);
   if (!DefMI->isImplicitDef()) {
@@ -1375,27 +1396,6 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
   NewMI.setDebugLoc(DL);
 
   // In a situation like the following:
-  //
-  //    undef %2.subreg:reg = INST %1:reg         ; DefMI (rematerializable),
-  //                                              ; DefSubIdx = subreg
-  //    %3:reg = COPY %2                          ; SrcIdx = DstIdx = 0
-  //    .... = SOMEINSTR %3:reg
-  //
-  // there are no subranges for %3 so after rematerialization we need
-  // to explicitly create them. Undefined subranges are removed later on.
-  if (DstReg.isVirtual() && DefSubIdx && !CP.getSrcIdx() && !CP.getDstIdx() &&
-      MRI->shouldTrackSubRegLiveness(DstReg)) {
-    LiveInterval &DstInt = LIS->getInterval(DstReg);
-    if (!DstInt.hasSubRanges()) {
-      LaneBitmask FullMask = MRI->getMaxLaneMaskForVReg(DstReg);
-      LaneBitmask UsedLanes = TRI->getSubRegIndexLaneMask(DefSubIdx);
-      LaneBitmask UnusedLanes = FullMask & ~UsedLanes;
-      DstInt.createSubRangeFrom(LIS->getVNInfoAllocator(), UsedLanes, DstInt);
-      DstInt.createSubRangeFrom(LIS->getVNInfoAllocator(), UnusedLanes, DstInt);
-    }
-  }
-
-  // In a situation like the following:
   //     %0:subreg = instr              ; DefMI, subreg = DstIdx
   //     %1        = copy %0:subreg ; CopyMI, SrcIdx = 0
   // instead of widening %1 to the register class of %0 simply do:
@@ -1523,6 +1523,27 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
     // sure that "undef" is not set.
     if (NewIdx == 0)
       NewMI.getOperand(0).setIsUndef(false);
+
+    // In a situation like the following:
+    //
+    //    undef %2.subreg:reg = INST %1:reg    ; DefMI (rematerializable),
+    //                                         ; Defines only some of lanes,
+    //                                         ; so DefSubIdx = NewIdx = subreg
+    //    %3:reg = COPY %2                     ; Copy full reg
+    //    .... = SOMEINSTR %3:reg              ; Use full reg
+    //
+    // there are no subranges for %3 so after rematerialization we need
+    // to explicitly create them. Undefined subranges are removed later on.
+    if (NewIdx && !DstInt.hasSubRanges() &&
+        MRI->shouldTrackSubRegLiveness(DstReg)) {
+      LaneBitmask FullMask = MRI->getMaxLaneMaskForVReg(DstReg);
+      LaneBitmask UsedLanes = TRI->getSubRegIndexLaneMask(NewIdx);
+      LaneBitmask UnusedLanes = FullMask & ~UsedLanes;
+      VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator();
+      DstInt.createSubRangeFrom(Alloc, UsedLanes, DstInt);
+      DstInt.createSubRangeFrom(Alloc, UnusedLanes, DstInt);
+    }
+
     // Add dead subregister definitions if we are defining the whole register
     // but only part of it is live.
     // This could happen if the rematerialization instruction is rematerializing
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index e89e7efa98c1..da3c834417d6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -141,7 +141,7 @@ static cl::opt<bool> EnableReduceLoadOpStoreWidth(
 static cl::opt<bool> ReduceLoadOpStoreWidthForceNarrowingProfitable(
     "combiner-reduce-load-op-store-width-force-narrowing-profitable",
     cl::Hidden, cl::init(false),
-    cl::desc("DAG combiner force override the narrowing profitable check when"
+    cl::desc("DAG combiner force override the narrowing profitable check when "
              "reducing the width of load/op/store sequences"));
 
 static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore(
@@ -20455,10 +20455,8 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
       Value.hasOneUse()) {
     LoadSDNode *LD = cast<LoadSDNode>(Value);
     EVT VT = LD->getMemoryVT();
-    if (!VT.isFloatingPoint() ||
-        VT != ST->getMemoryVT() ||
-        LD->isNonTemporal() ||
-        ST->isNonTemporal() ||
+    if (!VT.isSimple() || !VT.isFloatingPoint() || VT != ST->getMemoryVT() ||
+        LD->isNonTemporal() || ST->isNonTemporal() ||
         LD->getPointerInfo().getAddrSpace() != 0 ||
         ST->getPointerInfo().getAddrSpace() != 0)
       return SDValue();
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index e8404a13009a..89a00c5a4f04 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1777,6 +1777,31 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node,
   assert((BW == 64 || BW == 32) &&
          "Elements in vector-UINT_TO_FP must be 32 or 64 bits wide");
 
+  // If STRICT_/FMUL is not supported by the target (in case of f16) replace the
+  // UINT_TO_FP with a larger float and round to the smaller type
+  if ((!IsStrict && !TLI.isOperationLegalOrCustom(ISD::FMUL, DstVT)) ||
+      (IsStrict && !TLI.isOperationLegalOrCustom(ISD::STRICT_FMUL, DstVT))) {
+    EVT FPVT = BW == 32 ? MVT::f32 : MVT::f64;
+    SDValue UIToFP;
+    SDValue Result;
+    SDValue TargetZero = DAG.getIntPtrConstant(0, DL, /*isTarget=*/true);
+    EVT FloatVecVT = SrcVT.changeVectorElementType(FPVT);
+    if (IsStrict) {
+      UIToFP = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {FloatVecVT, MVT::Other},
+                           {Node->getOperand(0), Src});
+      Result = DAG.getNode(ISD::STRICT_FP_ROUND, DL, {DstVT, MVT::Other},
+                           {Node->getOperand(0), UIToFP, TargetZero});
+      Results.push_back(Result);
+      Results.push_back(Result.getValue(1));
+    } else {
+      UIToFP = DAG.getNode(ISD::UINT_TO_FP, DL, FloatVecVT, Src);
+      Result = DAG.getNode(ISD::FP_ROUND, DL, DstVT, UIToFP, TargetZero);
+      Results.push_back(Result);
+    }
+
+    return;
+  }
+
   SDValue HalfWord = DAG.getConstant(BW / 2, DL, SrcVT);
 
   // Constants to clear the upper part of the word.
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index 9e5867c70d7b..51ee3cc681f0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -125,9 +125,9 @@ static cl::opt<int> MaxReorderWindow(
   cl::desc("Number of instructions to allow ahead of the critical path "
            "in sched=list-ilp"));
 
-static cl::opt<unsigned> AvgIPC(
-  "sched-avg-ipc", cl::Hidden, cl::init(1),
-  cl::desc("Average inst/cycle whan no target itinerary exists."));
+static cl::opt<unsigned>
+    AvgIPC("sched-avg-ipc", cl::Hidden, cl::init(1),
+           cl::desc("Average inst/cycle when no target itinerary exists."));
 
 namespace {
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 26fc75c0578e..dff7243b0a99 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -43,9 +43,9 @@ STATISTIC(LoadsClustered, "Number of loads clustered together");
 // without a target itinerary. The choice of number here has more to do with
 // balancing scheduler heuristics than with the actual machine latency.
 static cl::opt<int> HighLatencyCycles(
-  "sched-high-latency-cycles", cl::Hidden, cl::init(10),
-  cl::desc("Roughly estimate the number of cycles that 'long latency'"
-           "instructions take for targets with no itinerary"));
+    "sched-high-latency-cycles", cl::Hidden, cl::init(10),
+    cl::desc("Roughly estimate the number of cycles that 'long latency' "
+             "instructions take for targets with no itinerary"));
 
 ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf)
     : ScheduleDAG(mf), InstrItins(mf.getSubtarget().getInstrItineraryData()) {}
diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp
index 6a9ebb41e79f..d47eb4416d3c 100644
--- a/llvm/lib/ExecutionEngine/Orc/Core.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp
@@ -1576,12 +1576,22 @@ void Platform::lookupInitSymbolsAsync(
   }
 }
 
+MaterializationTask::~MaterializationTask() {
+  // If this task wasn't run then fail materialization.
+  if (MR)
+    MR->failMaterialization();
+}
+
 void MaterializationTask::printDescription(raw_ostream &OS) {
   OS << "Materialization task: " << MU->getName() << " in "
      << MR->getTargetJITDylib().getName();
 }
 
-void MaterializationTask::run() { MU->materialize(std::move(MR)); }
+void MaterializationTask::run() {
+  assert(MU && "MU should not be null");
+  assert(MR && "MR should not be null");
+  MU->materialize(std::move(MR));
+}
 
 void LookupTask::printDescription(raw_ostream &OS) { OS << "Lookup task"; }
 
@@ -1821,17 +1831,10 @@ ExecutionSession::lookup(const JITDylibSearchOrder &SearchOrder,
                          RegisterDependenciesFunction RegisterDependencies) {
 #if LLVM_ENABLE_THREADS
   // In the threaded case we use promises to return the results.
-  std::promise<SymbolMap> PromisedResult;
-  Error ResolutionError = Error::success();
+  std::promise<MSVCPExpected<SymbolMap>> PromisedResult;
 
   auto NotifyComplete = [&](Expected<SymbolMap> R) {
-    if (R)
-      PromisedResult.set_value(std::move(*R));
-    else {
-      ErrorAsOutParameter _(ResolutionError);
-      ResolutionError = R.takeError();
-      PromisedResult.set_value(SymbolMap());
-    }
+    PromisedResult.set_value(std::move(R));
   };
 
 #else
@@ -1848,18 +1851,11 @@ ExecutionSession::lookup(const JITDylibSearchOrder &SearchOrder,
 #endif
 
   // Perform the asynchronous lookup.
-  lookup(K, SearchOrder, std::move(Symbols), RequiredState, NotifyComplete,
-         RegisterDependencies);
+  lookup(K, SearchOrder, std::move(Symbols), RequiredState,
+         std::move(NotifyComplete), RegisterDependencies);
 
 #if LLVM_ENABLE_THREADS
-  auto ResultFuture = PromisedResult.get_future();
-  auto Result = ResultFuture.get();
-
-  if (ResolutionError)
-    return std::move(ResolutionError);
-
-  return std::move(Result);
-
+  return PromisedResult.get_future().get();
 #else
   if (ResolutionError)
     return std::move(ResolutionError);
diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
index 6688b0935a2d..9bc0aa89c353 100644
--- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
@@ -16,8 +16,6 @@ namespace llvm::orc {
 
 char ObjectLinkingLayer::ID;
 
-using BaseObjectLayer = RTTIExtends<ObjectLinkingLayer, ObjectLayer>;
-
 void ObjectLinkingLayer::emit(std::unique_ptr<MaterializationResponsibility> R,
                               std::unique_ptr<MemoryBuffer> O) {
   assert(O && "Object must not be null");
diff --git a/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp b/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp
index fbe4b093b0c6..1af17e85220d 100644
--- a/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TaskDispatch.cpp
@@ -31,6 +31,10 @@ void DynamicThreadPoolTaskDispatcher::dispatch(std::unique_ptr<Task> T) {
   {
     std::lock_guard<std::mutex> Lock(DispatchMutex);
 
+    // Reject new tasks if they're dispatched after a call to shutdown.
+    if (Shutdown)
+      return;
+
     if (IsMaterializationTask) {
 
       // If this is a materialization task and there are too many running
@@ -54,6 +58,14 @@ void DynamicThreadPoolTaskDispatcher::dispatch(std::unique_ptr<Task> T) {
       // Run the task.
       T->run();
 
+      // Reset the task to free any resources. We need this to happen *before*
+      // we notify anyone (via Outstanding) that this thread is done to ensure
+      // that we don't proceed with JIT shutdown while still holding resources.
+      // (E.g. this was causing "Dangling SymbolStringPtr" assertions).
+      T.reset();
+
+      // Check the work queue state and either proceed with the next task or
+      // end this thread.
       std::lock_guard<std::mutex> Lock(DispatchMutex);
       if (!MaterializationTaskQueue.empty()) {
         // If there are any materialization tasks running then steal that work.
@@ -64,7 +76,6 @@ void DynamicThreadPoolTaskDispatcher::dispatch(std::unique_ptr<Task> T) {
           IsMaterializationTask = true;
         }
       } else {
-        // Otherwise decrement work counters.
         if (IsMaterializationTask)
           --NumMaterializationThreads;
         --Outstanding;
@@ -78,7 +89,7 @@ void DynamicThreadPoolTaskDispatcher::dispatch(std::unique_ptr<Task> T) {
 
 void DynamicThreadPoolTaskDispatcher::shutdown() {
   std::unique_lock<std::mutex> Lock(DispatchMutex);
-  Running = false;
+  Shutdown = true;
   OutstandingCV.wait(Lock, [this]() { return Outstanding == 0; });
 }
 #endif
diff --git a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp
index 93bc6631e64c..d4eb6a9b9fc0 100644
--- a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp
@@ -116,6 +116,11 @@ uint64_t MachOLayoutBuilder::layoutSegments() {
   const bool IsObjectFile =
       O.Header.FileType == MachO::HeaderFileType::MH_OBJECT;
   uint64_t Offset = IsObjectFile ? (HeaderSize + O.Header.SizeOfCmds) : 0;
+  if (O.EncryptionInfoCommandIndex) {
+    // If we are emitting an encryptable binary, our load commands must have a
+    // separate (non-encrypted) page to themselves.
+    Offset = alignToPowerOf2(HeaderSize + O.Header.SizeOfCmds, PageSize);
+  }
   for (LoadCommand &LC : O.LoadCommands) {
     auto &MLC = LC.MachOLoadCommand;
     StringRef Segname;
diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.cpp b/llvm/lib/ObjCopy/MachO/MachOObject.cpp
index 8d2c02dc37c9..e0819d89d24f 100644
--- a/llvm/lib/ObjCopy/MachO/MachOObject.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOObject.cpp
@@ -98,6 +98,10 @@ void Object::updateLoadCommandIndexes() {
     case MachO::LC_DYLD_EXPORTS_TRIE:
       ExportsTrieCommandIndex = Index;
       break;
+    case MachO::LC_ENCRYPTION_INFO:
+    case MachO::LC_ENCRYPTION_INFO_64:
+      EncryptionInfoCommandIndex = Index;
+      break;
     }
   }
 }
diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.h b/llvm/lib/ObjCopy/MachO/MachOObject.h
index a454c4f502fd..79eb0133c280 100644
--- a/llvm/lib/ObjCopy/MachO/MachOObject.h
+++ b/llvm/lib/ObjCopy/MachO/MachOObject.h
@@ -341,6 +341,9 @@ struct Object {
   /// The index of the LC_SEGMENT or LC_SEGMENT_64 load command
   /// corresponding to the __TEXT segment.
   std::optional<size_t> TextSegmentCommandIndex;
+  /// The index of the LC_ENCRYPTION_INFO or LC_ENCRYPTION_INFO_64 load command
+  /// if present.
+  std::optional<size_t> EncryptionInfoCommandIndex;
 
   BumpPtrAllocator Alloc;
   StringSaver NewSectionsContents;
diff --git a/llvm/lib/ObjCopy/MachO/MachOReader.cpp b/llvm/lib/ObjCopy/MachO/MachOReader.cpp
index 2b344f36d8e7..ef0e0262f939 100644
--- a/llvm/lib/ObjCopy/MachO/MachOReader.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOReader.cpp
@@ -184,6 +184,10 @@ Error MachOReader::readLoadCommands(Object &O) const {
     case MachO::LC_DYLD_CHAINED_FIXUPS:
       O.ChainedFixupsCommandIndex = O.LoadCommands.size();
       break;
+    case MachO::LC_ENCRYPTION_INFO:
+    case MachO::LC_ENCRYPTION_INFO_64:
+      O.EncryptionInfoCommandIndex = O.LoadCommands.size();
+      break;
     }
 #define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct)                         \
   case MachO::LCName:                                                          \
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index d737ea5ab070..4ec0fb8fc81e 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -189,9 +189,9 @@ static cl::opt<bool> EnableGlobalAnalyses(
     "enable-global-analyses", cl::init(true), cl::Hidden,
     cl::desc("Enable inter-procedural analyses"));
 
-static cl::opt<bool>
-    RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden,
-                       cl::desc("Run Partial inlinining pass"));
+static cl::opt<bool> RunPartialInlining("enable-partial-inlining",
+                                        cl::init(false), cl::Hidden,
+                                        cl::desc("Run Partial inlining pass"));
 
 static cl::opt<bool> ExtraVectorizerPasses(
     "extra-vectorizer-passes", cl::init(false), cl::Hidden,
@@ -264,7 +264,7 @@ static cl::opt<bool>
 static cl::opt<bool> FlattenedProfileUsed(
     "flattened-profile-used", cl::init(false), cl::Hidden,
     cl::desc("Indicate the sample profile being used is flattened, i.e., "
-             "no inline hierachy exists in the profile"));
+             "no inline hierarchy exists in the profile"));
 
 static cl::opt<bool> EnableOrderFileInstrumentation(
     "enable-order-file-instrumentation", cl::init(false), cl::Hidden,
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 9f0b09278edc..13e192fffbdd 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -156,7 +156,7 @@ MODULE_PASS("strip-nonlinetable-debuginfo", StripNonLineTableDebugInfoPass())
 MODULE_PASS("trigger-crash-module", TriggerCrashModulePass())
 MODULE_PASS("trigger-verifier-error", TriggerVerifierErrorPass())
 MODULE_PASS("tsan-module", ModuleThreadSanitizerPass())
-MODULE_PASS("tysan-module", ModuleTypeSanitizerPass())
+MODULE_PASS("tysan", TypeSanitizerPass())
 MODULE_PASS("verify", VerifierPass())
 MODULE_PASS("view-callgraph", CallGraphViewerPass())
 MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass())
@@ -481,7 +481,6 @@ FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass())
 FUNCTION_PASS("trigger-crash-function", TriggerCrashFunctionPass())
 FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass())
 FUNCTION_PASS("tsan", ThreadSanitizerPass())
-FUNCTION_PASS("tysan", TypeSanitizerPass())
 FUNCTION_PASS("typepromotion", TypePromotionPass(TM))
 FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass())
 FUNCTION_PASS("vector-combine", VectorCombinePass())
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index a4539fa94d89..e6a789c16cdf 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -135,6 +135,31 @@ Counter CounterExpressionBuilder::subtract(Counter LHS, Counter RHS,
   return Simplify ? simplify(Cnt) : Cnt;
 }
 
+Counter CounterExpressionBuilder::subst(Counter C, const SubstMap &Map) {
+  // Replace C with the value found in Map even if C is Expression.
+  if (auto I = Map.find(C); I != Map.end())
+    return I->second;
+
+  if (!C.isExpression())
+    return C;
+
+  auto CE = Expressions[C.getExpressionID()];
+  auto NewLHS = subst(CE.LHS, Map);
+  auto NewRHS = subst(CE.RHS, Map);
+
+  // Reconstruct Expression with induced subexpressions.
+  switch (CE.Kind) {
+  case CounterExpression::Add:
+    C = add(NewLHS, NewRHS);
+    break;
+  case CounterExpression::Subtract:
+    C = subtract(NewLHS, NewRHS);
+    break;
+  }
+
+  return C;
+}
+
 void CounterMappingContext::dump(const Counter &C, raw_ostream &OS) const {
   switch (C.getKind()) {
   case Counter::Zero:
diff --git a/llvm/lib/Target/AArch64/AArch64FMV.td b/llvm/lib/Target/AArch64/AArch64FMV.td
index fc7a94a5fe47..e0f56fd55561 100644
--- a/llvm/lib/Target/AArch64/AArch64FMV.td
+++ b/llvm/lib/Target/AArch64/AArch64FMV.td
@@ -22,64 +22,65 @@
 
 
 // Something you can add to target_version or target_clones.
-class FMVExtension<string n, string b, int p> {
+class FMVExtension<string name, string enumeration> {
     // Name, as spelled in target_version or target_clones. e.g. "memtag".
-    string Name = n;
+    string Name = name;
 
     // A C++ expression giving the number of the bit in the FMV ABI.
     // Currently this is given as a value from the enum "CPUFeatures".
-    string Bit = b;
+    string FeatureBit = "FEAT_" # enumeration;
 
     // SubtargetFeature enabled for codegen when this FMV feature is present.
-    string BackendFeature = n;
+    string BackendFeature = name;
 
-    // The FMV priority.
-    int Priority = p;
+    // A C++ expression giving the number of the priority bit.
+    // Currently this is given as a value from the enum "FeatPriorities".
+    string PriorityBit = "PRIOR_" # enumeration;
 }
 
-def : FMVExtension<"aes", "FEAT_PMULL", 150>;
-def : FMVExtension<"bf16", "FEAT_BF16", 280>;
-def : FMVExtension<"bti", "FEAT_BTI", 510>;
-def : FMVExtension<"crc", "FEAT_CRC", 110>;
-def : FMVExtension<"dit", "FEAT_DIT", 180>;
-def : FMVExtension<"dotprod", "FEAT_DOTPROD", 104>;
-let BackendFeature = "ccpp" in def : FMVExtension<"dpb", "FEAT_DPB", 190>;
-let BackendFeature = "ccdp" in def : FMVExtension<"dpb2", "FEAT_DPB2", 200>;
-def : FMVExtension<"f32mm", "FEAT_SVE_F32MM", 350>;
-def : FMVExtension<"f64mm", "FEAT_SVE_F64MM", 360>;
-def : FMVExtension<"fcma", "FEAT_FCMA", 220>;
-def : FMVExtension<"flagm", "FEAT_FLAGM", 20>;
-let BackendFeature = "altnzcv" in def : FMVExtension<"flagm2", "FEAT_FLAGM2", 30>;
-def : FMVExtension<"fp", "FEAT_FP", 90>;
-def : FMVExtension<"fp16", "FEAT_FP16", 170>;
-def : FMVExtension<"fp16fml", "FEAT_FP16FML", 175>;
-let BackendFeature = "fptoint" in def : FMVExtension<"frintts", "FEAT_FRINTTS", 250>;
-def : FMVExtension<"i8mm", "FEAT_I8MM", 270>;
-def : FMVExtension<"jscvt", "FEAT_JSCVT", 210>;
-def : FMVExtension<"ls64", "FEAT_LS64_ACCDATA", 520>;
-def : FMVExtension<"lse", "FEAT_LSE", 80>;
-def : FMVExtension<"memtag", "FEAT_MEMTAG2", 440>;
-def : FMVExtension<"mops", "FEAT_MOPS", 650>;
-def : FMVExtension<"predres", "FEAT_PREDRES", 480>;
-def : FMVExtension<"rcpc", "FEAT_RCPC", 230>;
-let BackendFeature = "rcpc-immo" in def : FMVExtension<"rcpc2", "FEAT_RCPC2", 240>;
-def : FMVExtension<"rcpc3", "FEAT_RCPC3", 241>;
-def : FMVExtension<"rdm", "FEAT_RDM", 108>;
-def : FMVExtension<"rng", "FEAT_RNG", 10>;
-def : FMVExtension<"sb", "FEAT_SB", 470>;
-def : FMVExtension<"sha2", "FEAT_SHA2", 130>;
-def : FMVExtension<"sha3", "FEAT_SHA3", 140>;
-def : FMVExtension<"simd", "FEAT_SIMD", 100>;
-def : FMVExtension<"sm4", "FEAT_SM4", 106>;
-def : FMVExtension<"sme", "FEAT_SME", 430>;
-def : FMVExtension<"sme-f64f64", "FEAT_SME_F64", 560>;
-def : FMVExtension<"sme-i16i64", "FEAT_SME_I64", 570>;
-def : FMVExtension<"sme2", "FEAT_SME2", 580>;
-def : FMVExtension<"ssbs", "FEAT_SSBS2", 490>;
-def : FMVExtension<"sve", "FEAT_SVE", 310>;
-def : FMVExtension<"sve2", "FEAT_SVE2", 370>;
-def : FMVExtension<"sve2-aes", "FEAT_SVE_PMULL128", 380>;
-def : FMVExtension<"sve2-bitperm", "FEAT_SVE_BITPERM", 400>;
-def : FMVExtension<"sve2-sha3", "FEAT_SVE_SHA3", 410>;
-def : FMVExtension<"sve2-sm4", "FEAT_SVE_SM4", 420>;
-def : FMVExtension<"wfxt", "FEAT_WFXT", 550>;
+def : FMVExtension<"aes", "PMULL">;
+def : FMVExtension<"bf16", "BF16">;
+def : FMVExtension<"bti", "BTI">;
+def : FMVExtension<"crc", "CRC">;
+def : FMVExtension<"dit", "DIT">;
+def : FMVExtension<"dotprod", "DOTPROD">;
+let BackendFeature = "ccpp" in def : FMVExtension<"dpb", "DPB">;
+let BackendFeature = "ccdp" in def : FMVExtension<"dpb2", "DPB2">;
+def : FMVExtension<"f32mm", "SVE_F32MM">;
+def : FMVExtension<"f64mm", "SVE_F64MM">;
+def : FMVExtension<"fcma", "FCMA">;
+def : FMVExtension<"flagm", "FLAGM">;
+let BackendFeature = "altnzcv" in def : FMVExtension<"flagm2", "FLAGM2">;
+def : FMVExtension<"fp", "FP">;
+def : FMVExtension<"fp16", "FP16">;
+def : FMVExtension<"fp16fml", "FP16FML">;
+let BackendFeature = "fptoint" in def : FMVExtension<"frintts", "FRINTTS">;
+def : FMVExtension<"i8mm", "I8MM">;
+def : FMVExtension<"jscvt", "JSCVT">;
+def : FMVExtension<"ls64", "LS64_ACCDATA">;
+def : FMVExtension<"lse", "LSE">;
+def : FMVExtension<"memtag", "MEMTAG2">;
+def : FMVExtension<"mops", "MOPS">;
+def : FMVExtension<"predres", "PREDRES">;
+def : FMVExtension<"rcpc", "RCPC">;
+let BackendFeature = "rcpc-immo" in def : FMVExtension<"rcpc2", "RCPC2">;
+def : FMVExtension<"rcpc3", "RCPC3">;
+def : FMVExtension<"rdm", "RDM">;
+def : FMVExtension<"rng", "RNG">;
+def : FMVExtension<"sb", "SB">;
+def : FMVExtension<"sha2", "SHA2">;
+def : FMVExtension<"sha3", "SHA3">;
+def : FMVExtension<"simd", "SIMD">;
+def : FMVExtension<"sm4", "SM4">;
+def : FMVExtension<"sme", "SME">;
+def : FMVExtension<"sme-f64f64", "SME_F64">;
+def : FMVExtension<"sme-i16i64", "SME_I64">;
+def : FMVExtension<"sme2", "SME2">;
+def : FMVExtension<"ssbs", "SSBS2">;
+def : FMVExtension<"sve", "SVE">;
+def : FMVExtension<"sve2", "SVE2">;
+def : FMVExtension<"sve2-aes", "SVE_PMULL128">;
+def : FMVExtension<"sve2-bitperm", "SVE_BITPERM">;
+def : FMVExtension<"sve2-sha3", "SVE_SHA3">;
+def : FMVExtension<"sve2-sm4", "SVE_SM4">;
+def : FMVExtension<"wfxt", "WFXT">;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ef00b092fe5e..3ad2905ce520 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -753,6 +753,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(Op, MVT::v8bf16, Expand);
   }
 
+  // For bf16, fpextend is custom lowered to be optionally expanded into shifts.
+  setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
+  setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
+  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Custom);
+  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
+  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
+  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f32, Custom);
+
   auto LegalizeNarrowFP = [this](MVT ScalarVT) {
     for (auto Op : {
              ISD::SETCC,
@@ -893,10 +901,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(Op, MVT::f16, Legal);
   }
 
-  // Strict conversion to a larger type is legal
-  for (auto VT : {MVT::f32, MVT::f64})
-    setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
-
   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
 
   setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
@@ -1183,6 +1187,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setMaxDivRemBitWidthSupported(128);
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  if (Subtarget->hasSME())
+    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
 
   if (Subtarget->isNeonAvailable()) {
     // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
@@ -4496,6 +4502,54 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
   if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
     return LowerFixedLengthFPExtendToSVE(Op, DAG);
 
+  bool IsStrict = Op->isStrictFPOpcode();
+  SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
+  EVT Op0VT = Op0.getValueType();
+  if (VT == MVT::f64) {
+    // FP16->FP32 extends are legal for v32 and v4f32.
+    if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
+      return Op;
+    // Split bf16->f64 extends into two fpextends.
+    if (Op0VT == MVT::bf16 && IsStrict) {
+      SDValue Ext1 =
+          DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
+                      {Op0, Op.getOperand(0)});
+      return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
+                         {Ext1, Ext1.getValue(1)});
+    }
+    if (Op0VT == MVT::bf16)
+      return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
+                         DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
+    return SDValue();
+  }
+
+  if (VT.getScalarType() == MVT::f32) {
+    // FP16->FP32 extends are legal for v32 and v4f32.
+    if (Op0VT.getScalarType() == MVT::f16)
+      return Op;
+    if (Op0VT.getScalarType() == MVT::bf16) {
+      SDLoc DL(Op);
+      EVT IVT = VT.changeTypeToInteger();
+      if (!Op0VT.isVector()) {
+        Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4bf16, Op0);
+        IVT = MVT::v4i32;
+      }
+
+      EVT Op0IVT = Op0.getValueType().changeTypeToInteger();
+      SDValue Ext =
+          DAG.getNode(ISD::ANY_EXTEND, DL, IVT, DAG.getBitcast(Op0IVT, Op0));
+      SDValue Shift =
+          DAG.getNode(ISD::SHL, DL, IVT, Ext, DAG.getConstant(16, DL, IVT));
+      if (!Op0VT.isVector())
+        Shift = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Shift,
+                            DAG.getConstant(0, DL, MVT::i64));
+      Shift = DAG.getBitcast(VT, Shift);
+      return IsStrict ? DAG.getMergeValues({Shift, Op.getOperand(0)}, DL)
+                      : Shift;
+    }
+    return SDValue();
+  }
+
   assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
   return SDValue();
 }
@@ -7343,6 +7397,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
   case ISD::STRICT_FP_ROUND:
     return LowerFP_ROUND(Op, DAG);
   case ISD::FP_EXTEND:
+  case ISD::STRICT_FP_EXTEND:
     return LowerFP_EXTEND(Op, DAG);
   case ISD::FRAMEADDR:
     return LowerFRAMEADDR(Op, DAG);
@@ -27429,6 +27484,15 @@ void AArch64TargetLowering::ReplaceNodeResults(
       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
       return;
     }
+    case Intrinsic::aarch64_sme_in_streaming_mode: {
+      SDLoc DL(N);
+      SDValue Chain = DAG.getEntryNode();
+      SDValue RuntimePStateSM =
+          getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
+      Results.push_back(
+          DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM));
+      return;
+    }
     case Intrinsic::experimental_vector_match:
     case Intrinsic::get_active_lane_mask: {
       if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 47c4c6c39565..f527f7e4eafb 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1804,7 +1804,9 @@ class TMSystemException<bits<3> op1, string asm, list<dag> pattern>
 }
 
 class APASI : SimpleSystemI<0, (ins GPR64:$Xt), "apas", "\t$Xt">, Sched<[]> {
+  bits<5> Xt;
   let Inst{20-5} = 0b0111001110000000;
+  let Inst{4-0} = Xt;
   let DecoderNamespace = "APAS";
 }
 
@@ -2768,6 +2770,8 @@ class MulHi<bits<3> opc, string asm, SDNode OpNode>
   let Inst{23-21} = opc;
   let Inst{20-16} = Rm;
   let Inst{15}    = 0;
+  let Inst{14-10} = 0b11111;
+  let Unpredictable{14-10} = 0b11111;
   let Inst{9-5}   = Rn;
   let Inst{4-0}   = Rd;
 
@@ -4920,6 +4924,8 @@ class LoadExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
   bits<5> Rt;
   bits<5> Rt2;
   bits<5> Rn;
+  let Inst{20-16} = 0b11111;
+  let Unpredictable{20-16} = 0b11111;
   let Inst{14-10} = Rt2;
   let Inst{9-5} = Rn;
   let Inst{4-0} = Rt;
@@ -4935,6 +4941,7 @@ class BaseLoadStoreExclusiveLSUI<bits<2> sz, bit L, bit o0,
   let Inst{31-30} = sz;
   let Inst{29-23} = 0b0010010;
   let Inst{22}    = L;
+  let Inst{21}    = 0b0;
   let Inst{15}    = o0;
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ec891ea4bac8..c6f5cdcd1d5f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5123,22 +5123,6 @@ let Predicates = [HasFullFP16] in {
 //===----------------------------------------------------------------------===//
 
 defm FCVT : FPConversion<"fcvt">;
-// Helper to get bf16 into fp32.
-def cvt_bf16_to_fp32 :
-  OutPatFrag<(ops node:$Rn),
-             (f32 (COPY_TO_REGCLASS
-	         (i32 (UBFMWri
-		   (i32 (COPY_TO_REGCLASS (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
-		                           node:$Rn, hsub), GPR32)),
-	           (i64 (i32shift_a (i64 16))),
-                   (i64 (i32shift_b (i64 16))))),
-	         FPR32))>;
-// Pattern for bf16 -> fp32.
-def : Pat<(f32 (any_fpextend (bf16 FPR16:$Rn))),
-          (cvt_bf16_to_fp32 FPR16:$Rn)>;
-// Pattern for bf16 -> fp64.
-def : Pat<(f64 (any_fpextend (bf16 FPR16:$Rn))),
-          (FCVTDSr (f32 (cvt_bf16_to_fp32 FPR16:$Rn)))>;
 
 //===----------------------------------------------------------------------===//
 // Floating point single operand instructions.
@@ -8333,8 +8317,6 @@ def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>
 def : Pat<(v2i64 (sext   (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>;
 def : Pat<(v2i64 (zext   (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
 def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
-// Vector bf16 -> fp32 is implemented morally as a zext + shift.
-def : Pat<(v4f32 (any_fpextend (v4bf16 V64:$Rn))), (SHLLv4i16 V64:$Rn)>;
 // Also match an extend from the upper half of a 128 bit source register.
 def : Pat<(v8i16 (anyext (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn)) ))),
           (USHLLv16i8_shift V128:$Rn, (i32 0))>;
diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index c76fc8abeeda..355a9d2a0415 100644
--- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -630,7 +630,7 @@ def ExactFPImmValues : GenericEnum {
 
 def ExactFPImmsList : GenericTable {
   let FilterClass = "ExactFPImm";
-  let Fields = ["Name", "Enum", "Repr"];
+  let Fields = ["Enum", "Repr"];
 }
 
 def lookupExactFPImmByEnum : SearchIndex {
@@ -638,11 +638,6 @@ def lookupExactFPImmByEnum : SearchIndex {
   let Key = ["Enum"];
 }
 
-def lookupExactFPImmByRepr : SearchIndex {
-  let Table = ExactFPImmsList;
-  let Key = ["Repr"];
-}
-
 def : ExactFPImm<"zero", "0.0", 0x0>;
 def : ExactFPImm<"half", "0.5", 0x1>;
 def : ExactFPImm<"one",  "1.0", 0x2>;
@@ -998,7 +993,6 @@ defm : TLBI<"VMALLWS2E1OS",  0b100, 0b1000, 0b0101, 0b010, 0>;
 class SysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
              bits<3> op2> {
   string Name = name;
-  string AltName = name;
   bits<16> Encoding;
   let Encoding{15-14} = op0;
   let Encoding{13-11} = op1;
@@ -1018,8 +1012,11 @@ def SysRegValues : GenericEnum {
 
 def SysRegsList : GenericTable {
   let FilterClass = "SysReg";
-  let Fields = ["Name", "AltName", "Encoding", "Readable", "Writeable",
-                "Requires"];
+  let Fields = ["Name", "Encoding", "Readable", "Writeable", "Requires"];
+
+  let PrimaryKey = ["Encoding"];
+  let PrimaryKeyName = "lookupSysRegByEncoding";
+  let PrimaryKeyReturnRange = true;
 }
 
 def lookupSysRegByName : SearchIndex {
@@ -1027,11 +1024,6 @@ def lookupSysRegByName : SearchIndex {
   let Key = ["Name"];
 }
 
-def lookupSysRegByEncoding : SearchIndex {
-  let Table = SysRegsList;
-  let Key = ["Encoding"];
-}
-
 class RWSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
                bits<3> op2>
     : SysReg<name, op0, op1, crn, crm, op2> {
@@ -1317,9 +1309,7 @@ def : RWSysReg<"TTBR0_EL1",          0b11, 0b000, 0b0010, 0b0000, 0b000>;
 def : RWSysReg<"TTBR0_EL3",          0b11, 0b110, 0b0010, 0b0000, 0b000>;
 
 let Requires = [{ {AArch64::FeatureEL2VMSA} }] in {
-def : RWSysReg<"TTBR0_EL2",          0b11, 0b100, 0b0010, 0b0000, 0b000> {
-  let AltName = "VSCTLR_EL2";
-}
+def : RWSysReg<"TTBR0_EL2",          0b11, 0b100, 0b0010, 0b0000, 0b000>;
 def : RWSysReg<"VTTBR_EL2",          0b11, 0b100, 0b0010, 0b0001, 0b000>;
 }
 
@@ -1706,9 +1696,7 @@ def : RWSysReg<"ICH_LR15_EL2",       0b11, 0b100, 0b1100, 0b1101, 0b111>;
 let Requires = [{ {AArch64::HasV8_0rOps} }] in {
 //Virtualization System Control Register
 //                                 Op0   Op1    CRn     CRm     Op2
-def : RWSysReg<"VSCTLR_EL2",       0b11, 0b100, 0b0010, 0b0000, 0b000> {
-  let AltName = "TTBR0_EL2";
-}
+def : RWSysReg<"VSCTLR_EL2",       0b11, 0b100, 0b0010, 0b0000, 0b000>;
 
 //MPU Type Register
 //                                 Op0   Op1    CRn     CRm     Op2
@@ -2376,7 +2364,6 @@ def : RWSysReg<"ACTLRALIAS_EL1",  0b11, 0b000, 0b0001, 0b0100, 0b101>;
 class PHint<bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
               bits<3> op2, string name> {
   string Name = name;
-  string AltName = name;
   bits<16> Encoding;
   let Encoding{15-14} = op0;
   let Encoding{13-11} = op1;
@@ -2394,7 +2381,7 @@ def PHintValues : GenericEnum {
 
 def PHintsList : GenericTable {
   let FilterClass = "PHint";
-  let Fields = ["Name", "AltName", "Encoding", "Requires"];
+  let Fields = ["Name", "Encoding", "Requires"];
 }
 
 def lookupPHintByName : SearchIndex {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5abe69ef1c9c..25b6731cb313 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2761,6 +2761,21 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     return AdjustCost(
         BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
 
+  static const TypeConversionCostTblEntry BF16Tbl[] = {
+      {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1},     // bfcvt
+      {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1},     // bfcvt
+      {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
+      {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
+      {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
+      {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
+      {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
+  };
+
+  if (ST->hasBF16())
+    if (const auto *Entry = ConvertCostTableLookup(
+            BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
+      return AdjustCost(Entry->Cost);
+
   static const TypeConversionCostTblEntry ConversionTbl[] = {
       {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1},    // xtn
       {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1},   // xtn
@@ -2848,6 +2863,14 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
       {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
       {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
       {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
+      //   BF16 (uses shift)
+      {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1},     // shl
+      {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2},     // shl+fcvt
+      {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
+      {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
+      {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
+      {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
+      {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
       // FP Ext and trunc
       {ISD::FP_ROUND, MVT::f32, MVT::f64, 1},     // fcvt
       {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
@@ -2860,6 +2883,15 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
       {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
       {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
       {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
+      //   BF16 (more complex, with +bf16 is handled above)
+      {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
+      {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
+      {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
+      {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
+      {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
+      {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
+      {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
+      {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
 
       // LowerVectorINT_TO_FP:
       {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
@@ -4706,10 +4738,21 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
   }
 
   Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
-  // Treat extractsubvector as single op permutation.
   bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
-  if (IsExtractSubvector && LT.second.isFixedLengthVector())
+  // A sebvector extract can be implemented with a ext (or trivial extract, if
+  // from lane 0). This currently only handles low or high extracts to prevent
+  // SLP vectorizer regressions.
+  if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
+    if (LT.second.is128BitVector() &&
+        cast<FixedVectorType>(SubTp)->getNumElements() ==
+            LT.second.getVectorNumElements() / 2) {
+      if (Index == 0)
+        return 0;
+      if (Index == (int)LT.second.getVectorNumElements() / 2)
+        return 1;
+    }
     Kind = TTI::SK_PermuteSingleSrc;
+  }
 
   // Check for broadcast loads, which are supported by the LD1R instruction.
   // In terms of code-size, the shuffle vector is free when a load + dup get
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index ae84bc953f35..875b505549f0 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -1874,26 +1874,25 @@ void AArch64InstPrinter::printBarriernXSOption(const MCInst *MI, unsigned OpNo,
     markup(O, Markup::Immediate) << "#" << Val;
 }
 
-static bool isValidSysReg(const AArch64SysReg::SysReg *Reg, bool Read,
+static bool isValidSysReg(const AArch64SysReg::SysReg &Reg, bool Read,
                           const MCSubtargetInfo &STI) {
-  return (Reg && (Read ? Reg->Readable : Reg->Writeable) &&
-          Reg->haveFeatures(STI.getFeatureBits()));
+  return (Read ? Reg.Readable : Reg.Writeable) &&
+         Reg.haveFeatures(STI.getFeatureBits());
 }
 
-// Looks up a system register either by encoding or by name. Some system
+// Looks up a system register either by encoding. Some system
 // registers share the same encoding between different architectures,
-// therefore a tablegen lookup by encoding will return an entry regardless
-// of the register's predication on a specific subtarget feature. To work
-// around this problem we keep an alternative name for such registers and
-// look them up by that name if the first lookup was unsuccessful.
+// to work around this tablegen will return a range of registers with the same
+// encodings. We need to check each register in the range to see if it valid.
 static const AArch64SysReg::SysReg *lookupSysReg(unsigned Val, bool Read,
                                                  const MCSubtargetInfo &STI) {
-  const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val);
-
-  if (Reg && !isValidSysReg(Reg, Read, STI))
-    Reg = AArch64SysReg::lookupSysRegByName(Reg->AltName);
+  auto Range = AArch64SysReg::lookupSysRegByEncoding(Val);
+  for (auto &Reg : Range) {
+    if (isValidSysReg(Reg, Read, STI))
+      return &Reg;
+  }
 
-  return Reg;
+  return nullptr;
 }
 
 void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
@@ -1917,7 +1916,7 @@ void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
 
   const AArch64SysReg::SysReg *Reg = lookupSysReg(Val, true /*Read*/, STI);
 
-  if (isValidSysReg(Reg, true /*Read*/, STI))
+  if (Reg)
     O << Reg->Name;
   else
     O << AArch64SysReg::genericRegisterString(Val);
@@ -1944,7 +1943,7 @@ void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo,
 
   const AArch64SysReg::SysReg *Reg = lookupSysReg(Val, false /*Read*/, STI);
 
-  if (isValidSysReg(Reg, false /*Read*/, STI))
+  if (Reg)
     O << Reg->Name;
   else
     O << AArch64SysReg::genericRegisterString(Val);
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 94bba4e4c351..b8d323649fea 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -564,11 +564,10 @@ LLVM_DECLARE_ENUM_AS_BITMASK(TailFoldingOpts,
                              /* LargestValue */ (long)TailFoldingOpts::Reverse);
 
 namespace AArch64ExactFPImm {
-  struct ExactFPImm {
-    const char *Name;
-    int Enum;
-    const char *Repr;
-  };
+struct ExactFPImm {
+  int Enum;
+  const char *Repr;
+};
 #define GET_ExactFPImmValues_DECL
 #define GET_ExactFPImmsList_DECL
 #include "AArch64GenSystemOperands.inc"
@@ -602,7 +601,6 @@ namespace AArch64PSBHint {
 namespace AArch64PHint {
 struct PHint {
   const char *Name;
-  const char *AltName;
   unsigned Encoding;
   FeatureBitset FeaturesRequired;
 
@@ -720,7 +718,6 @@ AArch64StringToVectorLayout(StringRef LayoutStr) {
 namespace AArch64SysReg {
   struct SysReg {
     const char Name[32];
-    const char AltName[32];
     unsigned Encoding;
     bool Readable;
     bool Writeable;
@@ -736,9 +733,6 @@ namespace AArch64SysReg {
 #define GET_SysRegValues_DECL
 #include "AArch64GenSystemOperands.inc"
 
-  const SysReg *lookupSysRegByName(StringRef);
-  const SysReg *lookupSysRegByEncoding(uint16_t);
-
   uint32_t parseGenericRegister(StringRef Name);
   std::string genericRegisterString(uint32_t Bits);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index e844904ebd1e..0f9798838d7f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1523,7 +1523,8 @@ Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
   bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
   bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
 
-  int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
+  unsigned BitWidth = Num->getType()->getScalarSizeInBits();
+  int NumDivBits = getDivNumBits(I, Num, Den, BitWidth - 32, IsSigned);
   if (NumDivBits == -1)
     return nullptr;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index d9eaf82c5214..27e9018d68a0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1997,7 +1997,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
   if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
     return false;
   SAddr = SelectSAddrFI(CurDAG, SAddr);
-  Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
+  Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
   return true;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 88205ea361c5..f2686bdf56b4 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -680,7 +680,7 @@ multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32> {
 class MUBUF_Pseudo_Store_Lds<string opName>
   : MUBUF_Pseudo<opName,
                  (outs),
-                 (ins SReg_128:$srsrc, SCSrc_b32:$soffset, Offset:$offset, CPol:$cpol, i1imm:$swz),
+                 (ins SReg_128_XNULL:$srsrc, SCSrc_b32:$soffset, Offset:$offset, CPol:$cpol, i1imm:$swz),
                  " $srsrc, $soffset$offset lds$cpol"> {
   let LGKM_CNT = 1;
   let mayLoad = 1;
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 3c7627ff60e9..1b94d6c43392 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -1524,7 +1524,7 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16> {
 
 class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC>
     : MIMG_gfx10<op.GFX10M, (outs VReg_128:$vdata), "GFX10"> {
-  let InOperandList = (ins AddrRC:$vaddr0, SReg_128:$srsrc, A16:$a16);
+  let InOperandList = (ins AddrRC:$vaddr0, SReg_128_XNULL:$srsrc, A16:$a16);
   let AsmString = opcode#" $vdata, $vaddr0, $srsrc$a16";
 
   let nsa = 0;
@@ -1532,13 +1532,13 @@ class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC>
 
 class MIMG_IntersectRay_nsa_gfx10<mimgopc op, string opcode, int num_addrs>
     : MIMG_nsa_gfx10<op.GFX10M, (outs VReg_128:$vdata), num_addrs, "GFX10"> {
-  let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$srsrc, A16:$a16));
+  let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$srsrc, A16:$a16));
   let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc$a16";
 }
 
 class MIMG_IntersectRay_gfx11<mimgopc op, string opcode, RegisterClass AddrRC>
     : MIMG_gfx11<op.GFX11, (outs VReg_128:$vdata), "GFX11"> {
-  let InOperandList = (ins AddrRC:$vaddr0, SReg_128:$srsrc, A16:$a16);
+  let InOperandList = (ins AddrRC:$vaddr0, SReg_128_XNULL:$srsrc, A16:$a16);
   let AsmString = opcode#" $vdata, $vaddr0, $srsrc$a16";
 
   let nsa = 0;
@@ -1548,7 +1548,7 @@ class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs,
                                   list<RegisterClass> addr_types>
     : MIMG_nsa_gfx11<op.GFX11, (outs VReg_128:$vdata), num_addrs, "GFX11",
                      addr_types> {
-  let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$srsrc, A16:$a16));
+  let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$srsrc, A16:$a16));
   let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc$a16";
 }
 
@@ -1556,7 +1556,7 @@ class VIMAGE_IntersectRay_gfx12<mimgopc op, string opcode, int num_addrs,
                                 list<RegisterClass> addr_types>
     : VIMAGE_gfx12<op.GFX12, (outs VReg_128:$vdata),
                    num_addrs, "GFX12", addr_types> {
-  let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$rsrc, A16:$a16));
+  let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$rsrc, A16:$a16));
   let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $rsrc$a16";
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index c2199fd587be..2bc19137b1ca 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1096,21 +1096,8 @@ void SIFoldOperandsImpl::foldOperand(
           B.addImm(Defs[I].second);
         }
         LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
-        return;
       }
 
-      if (Size != 4)
-        return;
-
-      Register Reg0 = UseMI->getOperand(0).getReg();
-      Register Reg1 = UseMI->getOperand(1).getReg();
-      if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1))
-        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
-      else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
-        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
-      else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&
-               TRI->isAGPR(*MRI, Reg1))
-        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
       return;
     }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b3cfa398d9b5..0ac84f4e1f02 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13985,6 +13985,43 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
   return Accum;
 }
 
+SDValue
+SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  SDValue RHS = N->getOperand(1);
+  auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
+  if (!CRHS)
+    return SDValue();
+
+  // TODO: Worth using computeKnownBits? Maybe expensive since it's so
+  // common.
+  uint64_t Val = CRHS->getZExtValue();
+  if (countr_zero(Val) >= 32) {
+    SelectionDAG &DAG = DCI.DAG;
+    SDLoc SL(N);
+    SDValue LHS = N->getOperand(0);
+
+    // Avoid carry machinery if we know the low half of the add does not
+    // contribute to the final result.
+    //
+    // add i64:x, K if computeTrailingZeros(K) >= 32
+    //  => build_pair (add x.hi, K.hi), x.lo
+
+    // Breaking the 64-bit add here with this strange constant is unlikely
+    // to interfere with addressing mode patterns.
+
+    SDValue Hi = getHiHalf64(LHS, DAG);
+    SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
+    SDValue AddHi =
+        DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
+
+    SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+    return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
+  }
+
+  return SDValue();
+}
+
 // Collect the ultimate src of each of the mul node's operands, and confirm
 // each operand is 8 bytes.
 static std::optional<ByteProvider<SDValue>>
@@ -14261,6 +14298,11 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
     return V;
   }
 
+  if (VT == MVT::i64) {
+    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
+      return Folded;
+  }
+
   if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
       (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
     SDValue TempNode(N, 0);
@@ -14446,6 +14488,11 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
   SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
 
+  if (VT == MVT::i64) {
+    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
+      return Folded;
+  }
+
   if (VT != MVT::i32)
     return SDValue();
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index f4641e7a6599..299c8f5f7392 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -212,6 +212,9 @@ private:
   unsigned getFusedOpcode(const SelectionDAG &DAG,
                           const SDNode *N0, const SDNode *N1) const;
   SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue foldAddSub64WithZeroLowBitsTo32(SDNode *N,
+                                          DAGCombinerInfo &DCI) const;
+
   SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 692e2867ca88..e6f333fbb878 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6890,9 +6890,8 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
   if (RsrcIdx != -1) {
     MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
-    if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) {
+    if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
       isRsrcLegal = false;
-    }
   }
 
   // The operands are legal.
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index e388efe73cdd..ee83dff227a8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3055,7 +3055,7 @@ def : GCNPat<
    (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0))), sub1)>;
 
 // If fcanonicalize's operand is implicitly canonicalized, we only need a copy.
-let AddedComplexity = 1000 in {
+let AddedComplexity = 8 in {
 foreach vt = [f16, v2f16, f32, v2f32, f64] in {
   def : GCNPat<
     (fcanonicalize (vt is_canonicalized:$src)),
@@ -3710,12 +3710,15 @@ def : IntMinMaxPat<V_MAXMIN_U32_e64, umin, umax_oneuse>;
 def : IntMinMaxPat<V_MINMAX_U32_e64, umax, umin_oneuse>;
 def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
 def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
-def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
-def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
 def : FPMinCanonMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
 def : FPMinCanonMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
-def : FPMinCanonMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
-def : FPMinCanonMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+}
+
+let True16Predicate = UseFakeTrue16Insts in {
+def : FPMinMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MINMAX_F16_fake16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MAXMIN_F16_fake16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
 }
 
 let SubtargetPredicate = isGFX9Plus in {
@@ -3723,6 +3726,10 @@ let True16Predicate = NotHasTrue16BitInsts in {
   defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax, VSrc_b16>;
   defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax, VSrc_b16>;
 }
+let True16Predicate = UseRealTrue16Insts in {
+  defm : Int16Med3Pat<V_MED3_I16_t16_e64, smin, smax, VSrcT_b16>;
+  defm : Int16Med3Pat<V_MED3_U16_t16_e64, umin, umax, VSrcT_b16>;
+}
 let True16Predicate = UseFakeTrue16Insts in {
   defm : Int16Med3Pat<V_MED3_I16_fake16_e64, smin, smax, VSrc_b16>;
   defm : Int16Med3Pat<V_MED3_U16_fake16_e64, umin, umax, VSrc_b16>;
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 60e4ce92ac25..37dcc1008625 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -341,10 +341,10 @@ let SubtargetPredicate = HasScalarDwordx3Loads in
 defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_128>;
 defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_256>;
 defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_512>;
-defm S_BUFFER_LOAD_I8 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_LOAD_U8 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_LOAD_I16 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_LOAD_U16 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_LOAD_I8 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_LOAD_U8 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_LOAD_I16 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_LOAD_U16 : SM_Pseudo_Loads <SReg_128_XNULL, SReg_32_XM0_XEXEC>;
 }
 
 let SubtargetPredicate = HasScalarStores in {
@@ -375,7 +375,7 @@ def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb
 
 defm S_ATC_PROBE        : SM_Pseudo_Probe <SReg_64>;
 let is_buffer = 1 in {
-defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <SReg_128>;
+defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <SReg_128_XNULL>;
 }
 } // SubtargetPredicate = isGFX8Plus
 
@@ -470,7 +470,7 @@ def S_PREFETCH_INST        : SM_Prefetch_Pseudo <"s_prefetch_inst", SReg_64, 1>;
 def S_PREFETCH_INST_PC_REL : SM_Prefetch_Pseudo <"s_prefetch_inst_pc_rel", SReg_64, 0>;
 def S_PREFETCH_DATA        : SM_Prefetch_Pseudo <"s_prefetch_data", SReg_64, 1>;
 def S_PREFETCH_DATA_PC_REL : SM_Prefetch_Pseudo <"s_prefetch_data_pc_rel", SReg_64, 0>;
-def S_BUFFER_PREFETCH_DATA : SM_Prefetch_Pseudo <"s_buffer_prefetch_data", SReg_128, 1> {
+def S_BUFFER_PREFETCH_DATA : SM_Prefetch_Pseudo <"s_buffer_prefetch_data", SReg_128_XNULL, 1> {
   let is_buffer = 1;
 }
 } // end let SubtargetPredicate = isGFX12Plus
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index cef1f20f3420..24a2eede9ca3 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1374,8 +1374,8 @@ class VOP3_DOT_Profile_fake16<VOPProfile P, VOP3Features Features = VOP3_REGULAR
 let SubtargetPredicate = isGFX11Plus in {
   defm V_MAXMIN_F32     : VOP3Inst<"v_maxmin_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
   defm V_MINMAX_F32     : VOP3Inst<"v_minmax_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
-  defm V_MAXMIN_F16     : VOP3Inst<"v_maxmin_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>;
-  defm V_MINMAX_F16     : VOP3Inst<"v_minmax_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>;
+  defm V_MAXMIN_F16     : VOP3Inst_t16<"v_maxmin_f16", VOP_F16_F16_F16_F16>;
+  defm V_MINMAX_F16     : VOP3Inst_t16<"v_minmax_f16", VOP_F16_F16_F16_F16>;
   defm V_MAXMIN_U32     : VOP3Inst<"v_maxmin_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
   defm V_MINMAX_U32     : VOP3Inst<"v_minmax_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
   defm V_MAXMIN_I32     : VOP3Inst<"v_maxmin_i32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -1588,8 +1588,8 @@ defm V_MED3_NUM_F32       : VOP3_Realtriple_with_name_gfx12<0x231, "V_MED3_F32",
 defm V_MED3_NUM_F16       : VOP3_Realtriple_t16_and_fake16_gfx12<0x232, "v_med3_num_f16", "V_MED3_F16", "v_med3_f16">;
 defm V_MINMAX_NUM_F32     : VOP3_Realtriple_with_name_gfx12<0x268, "V_MINMAX_F32", "v_minmax_num_f32">;
 defm V_MAXMIN_NUM_F32     : VOP3_Realtriple_with_name_gfx12<0x269, "V_MAXMIN_F32", "v_maxmin_num_f32">;
-defm V_MINMAX_NUM_F16     : VOP3_Realtriple_with_name_gfx12<0x26a, "V_MINMAX_F16", "v_minmax_num_f16">;
-defm V_MAXMIN_NUM_F16     : VOP3_Realtriple_with_name_gfx12<0x26b, "V_MAXMIN_F16", "v_maxmin_num_f16">;
+defm V_MINMAX_NUM_F16     : VOP3_Realtriple_t16_and_fake16_gfx12<0x26a, "v_minmax_num_f16", "V_MINMAX_F16", "v_minmax_f16">;
+defm V_MAXMIN_NUM_F16     : VOP3_Realtriple_t16_and_fake16_gfx12<0x26b, "v_maxmin_num_f16", "V_MAXMIN_F16", "v_maxmin_f16">;
 defm V_MINIMUMMAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26c>;
 defm V_MAXIMUMMINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26d>;
 defm V_MINIMUMMAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26e>;
@@ -1730,8 +1730,8 @@ defm V_PERMLANE16_B32      : VOP3_Real_Base_gfx11_gfx12<0x25b>;
 defm V_PERMLANEX16_B32     : VOP3_Real_Base_gfx11_gfx12<0x25c>;
 defm V_MAXMIN_F32          : VOP3_Realtriple_gfx11<0x25e>;
 defm V_MINMAX_F32          : VOP3_Realtriple_gfx11<0x25f>;
-defm V_MAXMIN_F16          : VOP3_Realtriple_gfx11<0x260>;
-defm V_MINMAX_F16          : VOP3_Realtriple_gfx11<0x261>;
+defm V_MAXMIN_F16          : VOP3_Realtriple_t16_and_fake16_gfx11<0x260, "v_maxmin_f16">;
+defm V_MINMAX_F16          : VOP3_Realtriple_t16_and_fake16_gfx11<0x261, "v_minmax_f16">;
 defm V_MAXMIN_U32          : VOP3_Realtriple_gfx11_gfx12<0x262>;
 defm V_MINMAX_U32          : VOP3_Realtriple_gfx11_gfx12<0x263>;
 defm V_MAXMIN_I32          : VOP3_Realtriple_gfx11_gfx12<0x264>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index d236907b0eec..930ed9a5e2d0 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1909,8 +1909,8 @@ multiclass VOP3_Realtriple_t16_gfx11<bits<10> op, string asmName, string opName
 
 multiclass VOP3_Realtriple_t16_and_fake16_gfx11<bits<10> op, string asmName, string opName = NAME,
                                                 string pseudo_mnemonic = "", bit isSingle = 0> {
-  defm _t16: VOP3_Realtriple_t16_gfx11<op, opName#"_t16", asmName, pseudo_mnemonic, isSingle>;
-  defm _fake16: VOP3_Realtriple_t16_gfx11<op, opName#"_fake16", asmName, pseudo_mnemonic, isSingle>;
+  defm _t16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_t16", pseudo_mnemonic, isSingle>;
+  defm _fake16: VOP3_Realtriple_t16_gfx11<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>;
 }
 
 multiclass VOP3Only_Realtriple_t16_gfx11<bits<10> op, string asmName,
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index c67177cd5a6f..009b60c8400f 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -3320,7 +3320,7 @@ def STRH_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
 }
 
 
-
+let mayStore = 1, hasSideEffects = 0 in {
 def STRH_PRE  : AI3ldstidx<0b1011, 0, 1, (outs GPR:$Rn_wb),
                            (ins GPR:$Rt, addrmode3_pre:$addr), IndexModePre,
                            StMiscFrm, IIC_iStore_bh_ru,
@@ -3352,6 +3352,7 @@ def STRH_POST : AI3ldstidx<0b1011, 0, 0, (outs GPR:$Rn_wb),
   let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
   let DecoderMethod = "DecodeAddrMode3Instruction";
 }
+} // mayStore = 1, hasSideEffects = 0
 
 let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
 def STRD_PRE : AI3ldstidx<0b1111, 0, 1, (outs GPR:$Rn_wb),
diff --git a/llvm/lib/Target/ARM/ARMSystemRegister.td b/llvm/lib/Target/ARM/ARMSystemRegister.td
index c03db15d1041..3afc410e0456 100644
--- a/llvm/lib/Target/ARM/ARMSystemRegister.td
+++ b/llvm/lib/Target/ARM/ARMSystemRegister.td
@@ -19,17 +19,13 @@ class MClassSysReg<bits<1> UniqMask1,
                    bits<1> UniqMask2,
                    bits<1> UniqMask3,
                    bits<12> Enc12,
-                   string name> : SearchableTable {
-  let SearchableFields = ["Name", "M1Encoding12", "M2M3Encoding8", "Encoding"];
+                   string name> {
   string Name;
   bits<13> M1Encoding12;
   bits<10> M2M3Encoding8;
   bits<12> Encoding;
 
   let Name = name;
-  let EnumValueField = "M1Encoding12";
-  let EnumValueField = "M2M3Encoding8";
-  let EnumValueField = "Encoding";
 
   let M1Encoding12{12}    = UniqMask1;
   let M1Encoding12{11-00} = Enc12;
@@ -41,6 +37,27 @@ class MClassSysReg<bits<1> UniqMask1,
   code Requires           = [{ {} }];
 }
 
+def MClassSysRegsList : GenericTable {
+  let FilterClass = "MClassSysReg";
+  let Fields = ["Name", "M1Encoding12", "M2M3Encoding8", "Encoding",
+                "Requires"];
+}
+
+def lookupMClassSysRegByName : SearchIndex {
+  let Table = MClassSysRegsList;
+  let Key = ["Name"];
+}
+
+def lookupMClassSysRegByM1Encoding12 : SearchIndex {
+  let Table = MClassSysRegsList;
+  let Key = ["M1Encoding12"];
+}
+
+def lookupMClassSysRegByM2M3Encoding8 : SearchIndex {
+  let Table = MClassSysRegsList;
+  let Key = ["M2M3Encoding8"];
+}
+
 // [|i|e|x]apsr_nzcvq has alias [|i|e|x]apsr.
 //                 Mask1 Mask2 Mask3 Enc12, Name
 let Requires = [{ {ARM::FeatureDSP} }] in {
@@ -127,15 +144,29 @@ def : MClassSysReg<0,    0,    1,    0x8a7, "pac_key_u_3_ns">;
 
 // Banked Registers
 //
-class BankedReg<string name,  bits<8> enc>
-               : SearchableTable {
+class BankedReg<string name,  bits<8> enc> {
   string Name;
   bits<8> Encoding;
   let Name = name;
   let Encoding = enc;
-  let SearchableFields = ["Name", "Encoding"];
 }
 
+def BankedRegsList : GenericTable {
+  let FilterClass = "BankedReg";
+  let Fields = ["Name", "Encoding"];
+}
+
+def lookupBankedRegByName : SearchIndex {
+  let Table = BankedRegsList;
+  let Key = ["Name"];
+}
+
+def lookupBankedRegByEncoding : SearchIndex {
+  let Table = BankedRegsList;
+  let Key = ["Encoding"];
+}
+
+
 // The values here come from B9.2.3 of the ARM ARM, where bits 4-0 are SysM
 // and bit 5 is R.
 def : BankedReg<"r8_usr",   0x00>;
diff --git a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp
index 494c67d4b776..e76a70b3610a 100644
--- a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp
+++ b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp
@@ -62,13 +62,13 @@ const MClassSysReg *lookupMClassSysRegBy8bitSYSmValue(unsigned SYSm) {
   return ARMSysReg::lookupMClassSysRegByM2M3Encoding8((1<<8)|(SYSm & 0xFF));
 }
 
-#define GET_MCLASSSYSREG_IMPL
+#define GET_MClassSysRegsList_IMPL
 #include "ARMGenSystemRegister.inc"
 
 } // end namespace ARMSysReg
 
 namespace ARMBankedReg {
-#define GET_BANKEDREG_IMPL
+#define GET_BankedRegsList_IMPL
 #include "ARMGenSystemRegister.inc"
 } // end namespce ARMSysReg
 } // end namespace llvm
diff --git a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
index 5562572c5abf..dc4f811e075c 100644
--- a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
+++ b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
@@ -206,8 +206,8 @@ namespace ARMSysReg {
     }
   };
 
-  #define GET_MCLASSSYSREG_DECL
-  #include "ARMGenSystemRegister.inc"
+#define GET_MClassSysRegsList_DECL
+#include "ARMGenSystemRegister.inc"
 
   // lookup system register using 12-bit SYSm value.
   // Note: the search is uniqued using M1 mask
@@ -228,8 +228,8 @@ namespace ARMBankedReg {
     const char *Name;
     uint16_t Encoding;
   };
-  #define GET_BANKEDREG_DECL
-  #include "ARMGenSystemRegister.inc"
+#define GET_BankedRegsList_DECL
+#include "ARMGenSystemRegister.inc"
 } // end namespace ARMBankedReg
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 5d865a3c0bbb..62b5b704e99e 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -42,8 +42,10 @@ def FloatTy : DXILOpParamType;
 def DoubleTy : DXILOpParamType;
 def ResRetHalfTy : DXILOpParamType;
 def ResRetFloatTy : DXILOpParamType;
+def ResRetDoubleTy : DXILOpParamType;
 def ResRetInt16Ty : DXILOpParamType;
 def ResRetInt32Ty : DXILOpParamType;
+def ResRetInt64Ty : DXILOpParamType;
 def HandleTy : DXILOpParamType;
 def ResBindTy : DXILOpParamType;
 def ResPropsTy : DXILOpParamType;
@@ -890,6 +892,23 @@ def SplitDouble :  DXILOp<102, splitDouble> {
   let attributes = [Attributes<DXIL1_0, [ReadNone]>];
 }
 
+def RawBufferLoad : DXILOp<139, rawBufferLoad> {
+  let Doc = "reads from a raw buffer and structured buffer";
+  // Handle, Coord0, Coord1, Mask, Alignment
+  let arguments = [HandleTy, Int32Ty, Int32Ty, Int8Ty, Int32Ty];
+  let result = OverloadTy;
+  let overloads = [
+    Overloads<DXIL1_2,
+              [ResRetHalfTy, ResRetFloatTy, ResRetInt16Ty, ResRetInt32Ty]>,
+    Overloads<DXIL1_3,
+              [
+                ResRetHalfTy, ResRetFloatTy, ResRetDoubleTy, ResRetInt16Ty,
+                ResRetInt32Ty, ResRetInt64Ty
+              ]>
+  ];
+  let stages = [Stages<DXIL1_2, [all_stages]>];
+}
+
 def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> {
   let Doc = "signed dot product of 4 x i8 vectors packed into i32, with "
             "accumulate to i32";
diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
index 5d5bb3eacace..9f88ccd7a7b7 100644
--- a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
@@ -263,10 +263,14 @@ static Type *getTypeFromOpParamType(OpParamType Kind, LLVMContext &Ctx,
     return getResRetType(Type::getHalfTy(Ctx));
   case OpParamType::ResRetFloatTy:
     return getResRetType(Type::getFloatTy(Ctx));
+  case OpParamType::ResRetDoubleTy:
+    return getResRetType(Type::getDoubleTy(Ctx));
   case OpParamType::ResRetInt16Ty:
     return getResRetType(Type::getInt16Ty(Ctx));
   case OpParamType::ResRetInt32Ty:
     return getResRetType(Type::getInt32Ty(Ctx));
+  case OpParamType::ResRetInt64Ty:
+    return getResRetType(Type::getInt64Ty(Ctx));
   case OpParamType::HandleTy:
     return getHandleType(Ctx);
   case OpParamType::ResBindTy:
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index 4e01dd1145a5..f43815bf2116 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -415,8 +415,16 @@ public:
         }
       }
 
-      OldResult = cast<Instruction>(
-          IRB.CreateExtractValue(Op, 0, OldResult->getName()));
+      if (OldResult->use_empty()) {
+        // Only the check bit was used, so we're done here.
+        OldResult->eraseFromParent();
+        return Error::success();
+      }
+
+      assert(OldResult->hasOneUse() &&
+             isa<ExtractValueInst>(*OldResult->user_begin()) &&
+             "Expected only use to be extract of first element");
+      OldResult = cast<Instruction>(*OldResult->user_begin());
       OldTy = ST->getElementType(0);
     }
 
@@ -534,6 +542,48 @@ public:
     });
   }
 
+  [[nodiscard]] bool lowerRawBufferLoad(Function &F) {
+    Triple TT(Triple(M.getTargetTriple()));
+    VersionTuple DXILVersion = TT.getDXILVersion();
+    const DataLayout &DL = F.getDataLayout();
+    IRBuilder<> &IRB = OpBuilder.getIRB();
+    Type *Int8Ty = IRB.getInt8Ty();
+    Type *Int32Ty = IRB.getInt32Ty();
+
+    return replaceFunction(F, [&](CallInst *CI) -> Error {
+      IRB.SetInsertPoint(CI);
+
+      Type *OldTy = cast<StructType>(CI->getType())->getElementType(0);
+      Type *ScalarTy = OldTy->getScalarType();
+      Type *NewRetTy = OpBuilder.getResRetType(ScalarTy);
+
+      Value *Handle =
+          createTmpHandleCast(CI->getArgOperand(0), OpBuilder.getHandleType());
+      Value *Index0 = CI->getArgOperand(1);
+      Value *Index1 = CI->getArgOperand(2);
+      uint64_t NumElements =
+          DL.getTypeSizeInBits(OldTy) / DL.getTypeSizeInBits(ScalarTy);
+      Value *Mask = ConstantInt::get(Int8Ty, ~(~0U << NumElements));
+      Value *Align =
+          ConstantInt::get(Int32Ty, DL.getPrefTypeAlign(ScalarTy).value());
+
+      Expected<CallInst *> OpCall =
+          DXILVersion >= VersionTuple(1, 2)
+              ? OpBuilder.tryCreateOp(OpCode::RawBufferLoad,
+                                      {Handle, Index0, Index1, Mask, Align},
+                                      CI->getName(), NewRetTy)
+              : OpBuilder.tryCreateOp(OpCode::BufferLoad,
+                                      {Handle, Index0, Index1}, CI->getName(),
+                                      NewRetTy);
+      if (Error E = OpCall.takeError())
+        return E;
+      if (Error E = replaceResRetUses(CI, *OpCall, /*HasCheckBit=*/true))
+        return E;
+
+      return Error::success();
+    });
+  }
+
   [[nodiscard]] bool lowerUpdateCounter(Function &F) {
     IRBuilder<> &IRB = OpBuilder.getIRB();
     Type *Int32Ty = IRB.getInt32Ty();
@@ -723,14 +773,14 @@ public:
         HasErrors |= lowerGetPointer(F);
         break;
       case Intrinsic::dx_resource_load_typedbuffer:
-        HasErrors |= lowerTypedBufferLoad(F, /*HasCheckBit=*/false);
-        break;
-      case Intrinsic::dx_resource_loadchecked_typedbuffer:
         HasErrors |= lowerTypedBufferLoad(F, /*HasCheckBit=*/true);
         break;
       case Intrinsic::dx_resource_store_typedbuffer:
         HasErrors |= lowerTypedBufferStore(F);
         break;
+      case Intrinsic::dx_resource_load_rawbuffer:
+        HasErrors |= lowerRawBufferLoad(F);
+        break;
       case Intrinsic::dx_resource_updatecounter:
         HasErrors |= lowerUpdateCounter(F);
         break;
diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
index 1ff8f09f066d..837624935c5f 100644
--- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
+++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp
@@ -30,6 +30,9 @@ static void replaceTypedBufferAccess(IntrinsicInst *II,
          "Unexpected typed buffer type");
   Type *ContainedType = HandleType->getTypeParameter(0);
 
+  Type *LoadType =
+      StructType::get(ContainedType, Type::getInt1Ty(II->getContext()));
+
   // We need the size of an element in bytes so that we can calculate the offset
   // in elements given a total offset in bytes later.
   Type *ScalarType = ContainedType->getScalarType();
@@ -81,13 +84,15 @@ static void replaceTypedBufferAccess(IntrinsicInst *II,
         // We're storing a scalar, so we need to load the current value and only
         // replace the relevant part.
         auto *Load = Builder.CreateIntrinsic(
-            ContainedType, Intrinsic::dx_resource_load_typedbuffer,
+            LoadType, Intrinsic::dx_resource_load_typedbuffer,
             {II->getOperand(0), II->getOperand(1)});
+        auto *Struct = Builder.CreateExtractValue(Load, {0});
+
         // If we have an offset from seeing a GEP earlier, use it.
         Value *IndexOp = Current.Index
                              ? Current.Index
                              : ConstantInt::get(Builder.getInt32Ty(), 0);
-        V = Builder.CreateInsertElement(Load, V, IndexOp);
+        V = Builder.CreateInsertElement(Struct, V, IndexOp);
       } else {
         llvm_unreachable("Store to typed resource has invalid type");
       }
@@ -101,8 +106,10 @@ static void replaceTypedBufferAccess(IntrinsicInst *II,
     } else if (auto *LI = dyn_cast<LoadInst>(Current.Access)) {
       IRBuilder<> Builder(LI);
       Value *V = Builder.CreateIntrinsic(
-          ContainedType, Intrinsic::dx_resource_load_typedbuffer,
+          LoadType, Intrinsic::dx_resource_load_typedbuffer,
           {II->getOperand(0), II->getOperand(1)});
+      V = Builder.CreateExtractValue(V, {0});
+
       if (Current.Index)
         V = Builder.CreateExtractElement(V, Current.Index);
 
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
index ad079f45c882..5afe6b2d2883 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
@@ -15,14 +15,12 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/DXILMetadataAnalysis.h"
 #include "llvm/Analysis/DXILResource.h"
-#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
@@ -302,39 +300,6 @@ static MDTuple *emitTopLevelLibraryNode(Module &M, MDNode *RMD,
   return constructEntryMetadata(nullptr, nullptr, RMD, Properties, Ctx);
 }
 
-// TODO: We might need to refactor this to be more generic,
-// in case we need more metadata to be replaced.
-static void translateBranchMetadata(Module &M) {
-  for (Function &F : M) {
-    for (BasicBlock &BB : F) {
-      Instruction *BBTerminatorInst = BB.getTerminator();
-
-      MDNode *HlslControlFlowMD =
-          BBTerminatorInst->getMetadata("hlsl.controlflow.hint");
-
-      if (!HlslControlFlowMD)
-        continue;
-
-      assert(HlslControlFlowMD->getNumOperands() == 2 &&
-             "invalid operands for hlsl.controlflow.hint");
-
-      MDBuilder MDHelper(M.getContext());
-      ConstantInt *Op1 =
-          mdconst::extract<ConstantInt>(HlslControlFlowMD->getOperand(1));
-
-      SmallVector<llvm::Metadata *, 2> Vals(
-          ArrayRef<Metadata *>{MDHelper.createString("dx.controlflow.hints"),
-                               MDHelper.createConstant(Op1)});
-
-      MDNode *MDNode = llvm::MDNode::get(M.getContext(), Vals);
-
-      BBTerminatorInst->setMetadata("dx.controlflow.hints", MDNode);
-      BBTerminatorInst->setMetadata("hlsl.controlflow.hint", nullptr);
-    }
-    F.clearMetadata();
-  }
-}
-
 static void translateMetadata(Module &M, DXILBindingMap &DBM,
                               DXILResourceTypeMap &DRTM,
                               const Resources &MDResources,
@@ -407,7 +372,6 @@ PreservedAnalyses DXILTranslateMetadata::run(Module &M,
   const dxil::ModuleMetadataInfo MMDI = MAM.getResult<DXILMetadataAnalysis>(M);
 
   translateMetadata(M, DBM, DRTM, MDResources, ShaderFlags, MMDI);
-  translateBranchMetadata(M);
 
   return PreservedAnalyses::all();
 }
@@ -445,7 +409,6 @@ public:
         getAnalysis<DXILMetadataAnalysisWrapperPass>().getModuleMetadata();
 
     translateMetadata(M, DBM, DRTM, MDResources, ShaderFlags, MMDI);
-    translateBranchMetadata(M);
     return true;
   }
 };
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
index 45aadac86194..be68d46a876d 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -749,8 +749,8 @@ uint64_t DXILBitcodeWriter::getOptimizationFlags(const Value *V) {
     if (PEO->isExact())
       Flags |= 1 << bitc::PEO_EXACT;
   } else if (const auto *FPMO = dyn_cast<FPMathOperator>(V)) {
-    if (FPMO->hasAllowReassoc())
-      Flags |= bitc::AllowReassoc;
+    if (FPMO->hasAllowReassoc() || FPMO->hasAllowContract())
+      Flags |= bitc::UnsafeAlgebra;
     if (FPMO->hasNoNaNs())
       Flags |= bitc::NoNaNs;
     if (FPMO->hasNoInfs())
@@ -759,10 +759,6 @@ uint64_t DXILBitcodeWriter::getOptimizationFlags(const Value *V) {
       Flags |= bitc::NoSignedZeros;
     if (FPMO->hasAllowReciprocal())
       Flags |= bitc::AllowReciprocal;
-    if (FPMO->hasAllowContract())
-      Flags |= bitc::AllowContract;
-    if (FPMO->hasApproxFunc())
-      Flags |= bitc::ApproxFunc;
   }
 
   return Flags;
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index 46a8ab395d32..991ee5b1cbaa 100644
--- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -1796,6 +1796,8 @@ bool PolynomialMultiplyRecognize::recognize() {
     IterCount = CV->getValue()->getZExtValue() + 1;
 
   Value *CIV = getCountIV(LoopB);
+  if (CIV == nullptr)
+    return false;
   ParsedValues PV;
   Simplifier PreSimp;
   PV.IterCount = IterCount;
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index 54aeda283640..32bc8bb80129 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -154,6 +154,9 @@ void LoongArchInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                               Register VReg) const {
   MachineFunction *MF = MBB.getParent();
   MachineFrameInfo &MFI = MF->getFrameInfo();
+  DebugLoc DL;
+  if (I != MBB.end())
+    DL = I->getDebugLoc();
 
   unsigned Opcode;
   if (LoongArch::GPRRegClass.hasSubClassEq(RC))
@@ -177,7 +180,7 @@ void LoongArchInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
       MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
       MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
 
-  BuildMI(MBB, I, DebugLoc(), get(Opcode), DstReg)
+  BuildMI(MBB, I, DL, get(Opcode), DstReg)
       .addFrameIndex(FI)
       .addImm(0)
       .addMemOperand(MMO);
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 65e1893d3f3b..d34f45fcac00 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -14,7 +14,7 @@
 #include "NVPTX.h"
 #include "NVPTXUtilities.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/IR/NVVMIntrinsicFlags.h"
+#include "llvm/IR/NVVMIntrinsicUtils.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
diff --git a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
index f940dc05948b..c03ef8d33220 100644
--- a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
@@ -14,6 +14,7 @@
 #include "MCTargetDesc/NVPTXBaseInfo.h"
 #include "NVPTX.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
@@ -49,39 +50,34 @@ static std::string getHash(StringRef Str) {
   return llvm::utohexstr(Hash.low(), /*LowerCase=*/true);
 }
 
-static void addKernelMetadata(Module &M, GlobalValue *GV) {
+static void addKernelMetadata(Module &M, Function *F) {
   llvm::LLVMContext &Ctx = M.getContext();
 
   // Get "nvvm.annotations" metadata node.
   llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
 
-  llvm::Metadata *KernelMDVals[] = {
-      llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, "kernel"),
-      llvm::ConstantAsMetadata::get(
-          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
-
   // This kernel is only to be called single-threaded.
   llvm::Metadata *ThreadXMDVals[] = {
-      llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, "maxntidx"),
+      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidx"),
       llvm::ConstantAsMetadata::get(
           llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
   llvm::Metadata *ThreadYMDVals[] = {
-      llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, "maxntidy"),
+      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidy"),
       llvm::ConstantAsMetadata::get(
           llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
   llvm::Metadata *ThreadZMDVals[] = {
-      llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, "maxntidz"),
+      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "maxntidz"),
       llvm::ConstantAsMetadata::get(
           llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
 
   llvm::Metadata *BlockMDVals[] = {
-      llvm::ConstantAsMetadata::get(GV),
+      llvm::ConstantAsMetadata::get(F),
       llvm::MDString::get(Ctx, "maxclusterrank"),
       llvm::ConstantAsMetadata::get(
           llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
 
   // Append metadata to nvvm.annotations.
-  MD->addOperand(llvm::MDNode::get(Ctx, KernelMDVals));
+  F->setCallingConv(CallingConv::PTX_Kernel);
   MD->addOperand(llvm::MDNode::get(Ctx, ThreadXMDVals));
   MD->addOperand(llvm::MDNode::get(Ctx, ThreadYMDVals));
   MD->addOperand(llvm::MDNode::get(Ctx, ThreadZMDVals));
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index c51729e224bf..ef97844142d4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -14,10 +14,11 @@
 #include "NVPTXUtilities.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
-#include "llvm/IR/NVVMIntrinsicFlags.h"
+#include "llvm/IR/NVVMIntrinsicUtils.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -2449,6 +2450,11 @@ bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
   return true;
 }
 
+static inline bool isAddLike(const SDValue V) {
+  return V.getOpcode() == ISD::ADD ||
+         (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
+}
+
 // SelectDirectAddr - Match a direct address for DAG.
 // A direct address could be a globaladdress or externalsymbol.
 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
@@ -2475,7 +2481,7 @@ bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
 // symbol+offset
 bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
     SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
-  if (Addr.getOpcode() == ISD::ADD) {
+  if (isAddLike(Addr)) {
     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
       SDValue base = Addr.getOperand(0);
       if (SelectDirectAddr(base, Base)) {
@@ -2512,7 +2518,7 @@ bool NVPTXDAGToDAGISel::SelectADDRri_imp(
       Addr.getOpcode() == ISD::TargetGlobalAddress)
     return false; // direct calls.
 
-  if (Addr.getOpcode() == ISD::ADD) {
+  if (isAddLike(Addr)) {
     if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
       return false;
     }
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 4a98fe21b81d..c9b7e8745569 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -261,6 +261,9 @@ public:
     return true;
   }
 
+  bool isFAbsFree(EVT VT) const override { return true; }
+  bool isFNegFree(EVT VT) const override { return true; }
+
 private:
   const NVPTXSubtarget &STI; // cache the subtarget here
   SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 42043adc37b7..74ce6a9fc4ac 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -34,19 +34,18 @@ void NVPTXSubtarget::anchor() {}
 
 NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
                                                                 StringRef FS) {
-    // Provide the default CPU if we don't have one.
-    TargetName = std::string(CPU.empty() ? "sm_30" : CPU);
+  TargetName = std::string(CPU);
 
-    ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS);
+  ParseSubtargetFeatures(getTargetName(), /*TuneCPU=*/getTargetName(), FS);
 
-    // Re-map SM version numbers, SmVersion carries the regular SMs which do
-    // have relative order, while FullSmVersion allows distinguishing sm_90 from
-    // sm_90a, which would *not* be a subset of sm_91.
-    SmVersion = getSmVersion();
+  // Re-map SM version numbers, SmVersion carries the regular SMs which do
+  // have relative order, while FullSmVersion allows distinguishing sm_90 from
+  // sm_90a, which would *not* be a subset of sm_91.
+  SmVersion = getSmVersion();
 
-    // Set default to PTX 6.0 (CUDA 9.0)
-    if (PTXVersion == 0) {
-      PTXVersion = 60;
+  // Set default to PTX 6.0 (CUDA 9.0)
+  if (PTXVersion == 0) {
+    PTXVersion = 60;
   }
 
   return *this;
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 7555a2368ec9..bbc1cca7c12d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -111,7 +111,12 @@ public:
   // - 0 represents base GPU model,
   // - non-zero value identifies particular architecture-accelerated variant.
   bool hasAAFeatures() const { return getFullSmVersion() % 10; }
-  std::string getTargetName() const { return TargetName; }
+
+  // If the user did not provide a target we default to the `sm_30` target.
+  std::string getTargetName() const {
+    return TargetName.empty() ? "sm_30" : TargetName;
+  }
+  bool hasTargetName() const { return !TargetName.empty(); }
 
   // Get maximum value of required alignments among the supported data types.
   // From the PTX ISA doc, section 8.2.3:
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index b3b2880588cc..6d4b82aa54a2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -255,7 +255,10 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
   PB.registerPipelineStartEPCallback(
       [this](ModulePassManager &PM, OptimizationLevel Level) {
         FunctionPassManager FPM;
-        FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
+        // We do not want to fold out calls to nvvm.reflect early if the user
+        // has not provided a target architecture just yet.
+        if (Subtarget.hasTargetName())
+          FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
         // Note: NVVMIntrRangePass was causing numerical discrepancies at one
         // point, if issues crop up, consider disabling.
         FPM.addPass(NVVMIntrRangePass());
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 98bffd92a087..0f2bec711b24 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -311,11 +311,13 @@ std::optional<unsigned> getMaxNReg(const Function &F) {
 }
 
 bool isKernelFunction(const Function &F) {
+  if (F.getCallingConv() == CallingConv::PTX_Kernel)
+    return true;
+
   if (const auto X = findOneNVVMAnnotation(&F, "kernel"))
     return (*X == 1);
 
-  // There is no NVVM metadata, check the calling convention
-  return F.getCallingConv() == CallingConv::PTX_Kernel;
+  return false;
 }
 
 MaybeAlign getAlign(const Function &F, unsigned Index) {
diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 56525a1edc76..0cd584c40744 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -21,6 +21,7 @@
 #include "NVPTX.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/CodeGen/CommandFlags.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
@@ -219,7 +220,13 @@ bool NVVMReflect::runOnFunction(Function &F) {
   return runNVVMReflect(F, SmVersion);
 }
 
-NVVMReflectPass::NVVMReflectPass() : NVVMReflectPass(0) {}
+NVVMReflectPass::NVVMReflectPass() {
+  // Get the CPU string from the command line if not provided.
+  std::string MCPU = codegen::getMCPU();
+  StringRef SM = MCPU;
+  if (!SM.consume_front("sm_") || SM.consumeInteger(10, SmVersion))
+    SmVersion = 0;
+}
 
 PreservedAnalyses NVVMReflectPass::run(Function &F,
                                        FunctionAnalysisManager &AM) {
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index 44661647a863..98d3615ebab5 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -15,6 +15,7 @@ tablegen(LLVM RISCVGenRegisterBank.inc -gen-register-bank)
 tablegen(LLVM RISCVGenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM RISCVGenSearchableTables.inc -gen-searchable-tables)
 tablegen(LLVM RISCVGenSubtargetInfo.inc -gen-subtarget)
+tablegen(LLVM RISCVGenExegesis.inc -gen-exegesis)
 
 set(LLVM_TARGET_DEFINITIONS RISCVGISel.td)
 tablegen(LLVM RISCVGenGlobalISel.inc -gen-global-isel)
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 30122831767f..a490910154eb 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -698,6 +698,8 @@ DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
   TRY_TO_DECODE_FEATURE(
       RISCV::FeatureVendorXqcicli, DecoderTableXqcicli32,
       "Qualcomm uC Conditional Load Immediate custom opcode table");
+  TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXqcicm, DecoderTableXqcicm32,
+                        "Qualcomm uC Conditional Move custom opcode table");
   TRY_TO_DECODE(true, DecoderTable32, "RISCV32 table");
 
   return MCDisassembler::Fail;
@@ -727,6 +729,9 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size,
   TRY_TO_DECODE_FEATURE(
       RISCV::FeatureVendorXqciac, DecoderTableXqciac16,
       "Qualcomm uC Load-Store Address Calculation custom 16bit opcode table");
+  TRY_TO_DECODE_FEATURE(
+      RISCV::FeatureVendorXqcicm, DecoderTableXqcicm16,
+      "Qualcomm uC Conditional Move custom 16bit opcode table");
   TRY_TO_DECODE_AND_ADD_SP(STI.hasFeature(RISCV::FeatureVendorXwchc),
                            DecoderTableXwchc16,
                            "WCH QingKe XW custom opcode table");
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
index ef85057ba126..3f1539da4a9c 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp
@@ -80,7 +80,6 @@ private:
   bool selectFPCompare(MachineInstr &MI, MachineIRBuilder &MIB) const;
   void emitFence(AtomicOrdering FenceOrdering, SyncScope::ID FenceSSID,
                  MachineIRBuilder &MIB) const;
-  bool selectMergeValues(MachineInstr &MI, MachineIRBuilder &MIB) const;
   bool selectUnmergeValues(MachineInstr &MI, MachineIRBuilder &MIB) const;
 
   ComplexRendererFns selectShiftMask(MachineOperand &Root,
@@ -732,8 +731,6 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) {
   }
   case TargetOpcode::G_IMPLICIT_DEF:
     return selectImplicitDef(MI, MIB);
-  case TargetOpcode::G_MERGE_VALUES:
-    return selectMergeValues(MI, MIB);
   case TargetOpcode::G_UNMERGE_VALUES:
     return selectUnmergeValues(MI, MIB);
   default:
@@ -741,26 +738,13 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) {
   }
 }
 
-bool RISCVInstructionSelector::selectMergeValues(MachineInstr &MI,
-                                                 MachineIRBuilder &MIB) const {
-  assert(MI.getOpcode() == TargetOpcode::G_MERGE_VALUES);
-
-  // Build a F64 Pair from operands
-  if (MI.getNumOperands() != 3)
-    return false;
-  Register Dst = MI.getOperand(0).getReg();
-  Register Lo = MI.getOperand(1).getReg();
-  Register Hi = MI.getOperand(2).getReg();
-  if (!isRegInFprb(Dst) || !isRegInGprb(Lo) || !isRegInGprb(Hi))
-    return false;
-  MI.setDesc(TII.get(RISCV::BuildPairF64Pseudo));
-  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
-}
-
 bool RISCVInstructionSelector::selectUnmergeValues(
     MachineInstr &MI, MachineIRBuilder &MIB) const {
   assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
 
+  if (!Subtarget->hasStdExtZfa())
+    return false;
+
   // Split F64 Src into two s32 parts
   if (MI.getNumOperands() != 3)
     return false;
@@ -769,8 +753,17 @@ bool RISCVInstructionSelector::selectUnmergeValues(
   Register Hi = MI.getOperand(1).getReg();
   if (!isRegInFprb(Src) || !isRegInGprb(Lo) || !isRegInGprb(Hi))
     return false;
-  MI.setDesc(TII.get(RISCV::SplitF64Pseudo));
-  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
+
+  MachineInstr *ExtractLo = MIB.buildInstr(RISCV::FMV_X_W_FPR64, {Lo}, {Src});
+  if (!constrainSelectedInstRegOperands(*ExtractLo, TII, TRI, RBI))
+    return false;
+
+  MachineInstr *ExtractHi = MIB.buildInstr(RISCV::FMVH_X_D, {Hi}, {Src});
+  if (!constrainSelectedInstRegOperands(*ExtractHi, TII, TRI, RBI))
+    return false;
+
+  MI.eraseFromParent();
+  return true;
 }
 
 bool RISCVInstructionSelector::replacePtrWithInt(MachineOperand &Op,
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index 82847370b708..6f0645965d73 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/ValueTypes.h"
@@ -132,7 +133,14 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
 
   auto PtrVecTys = {nxv1p0, nxv2p0, nxv4p0, nxv8p0, nxv16p0};
 
-  getActionDefinitionsBuilder({G_ADD, G_SUB, G_AND, G_OR, G_XOR})
+  getActionDefinitionsBuilder({G_ADD, G_SUB})
+      .legalFor({sXLen})
+      .legalIf(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST))
+      .customFor(ST.is64Bit(), {s32})
+      .widenScalarToNextPow2(0)
+      .clampScalar(0, sXLen, sXLen);
+
+  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
       .legalFor({sXLen})
       .legalIf(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST))
       .widenScalarToNextPow2(0)
@@ -1330,6 +1338,24 @@ bool RISCVLegalizerInfo::legalizeCustom(
       return true;
     return Helper.lowerConstant(MI);
   }
+  case TargetOpcode::G_SUB:
+  case TargetOpcode::G_ADD: {
+    Helper.Observer.changingInstr(MI);
+    Helper.widenScalarSrc(MI, sXLen, 1, TargetOpcode::G_ANYEXT);
+    Helper.widenScalarSrc(MI, sXLen, 2, TargetOpcode::G_ANYEXT);
+
+    Register DstALU = MRI.createGenericVirtualRegister(sXLen);
+
+    MachineOperand &MO = MI.getOperand(0);
+    MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
+    auto DstSext = MIRBuilder.buildSExtInReg(sXLen, DstALU, 32);
+
+    MIRBuilder.buildInstr(TargetOpcode::G_TRUNC, {MO}, {DstSext});
+    MO.setReg(DstALU);
+
+    Helper.Observer.changedInstr(MI);
+    return true;
+  }
   case TargetOpcode::G_SEXT_INREG: {
     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
     int64_t SizeInBits = MI.getOperand(2).getImm();
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index eab4a5e77d96..0cb1ef0a66b6 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -38,9 +38,12 @@ std::optional<MCFixupKind> RISCVAsmBackend::getFixupKind(StringRef Name) const {
   if (STI.getTargetTriple().isOSBinFormatELF()) {
     unsigned Type;
     Type = llvm::StringSwitch<unsigned>(Name)
-#define ELF_RELOC(X, Y) .Case(#X, Y)
+#define ELF_RELOC(NAME, ID) .Case(#NAME, ID)
 #include "llvm/BinaryFormat/ELFRelocs/RISCV.def"
 #undef ELF_RELOC
+#define ELF_RISCV_NONSTANDARD_RELOC(_VENDOR, NAME, ID) .Case(#NAME, ID)
+#include "llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def"
+#undef ELF_RISCV_NONSTANDARD_RELOC
                .Case("BFD_RELOC_NONE", ELF::R_RISCV_NONE)
                .Case("BFD_RELOC_32", ELF::R_RISCV_32)
                .Case("BFD_RELOC_64", ELF::R_RISCV_64)
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index 963124140cd0..4e0c64a5ca2c 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -64,6 +64,12 @@ include "RISCVSchedXiangShanNanHu.td"
 include "RISCVProcessors.td"
 
 //===----------------------------------------------------------------------===//
+// Pfm Counters
+//===----------------------------------------------------------------------===//
+
+include "RISCVPfmCounters.td"
+
+//===----------------------------------------------------------------------===//
 // Define the RISC-V target.
 //===----------------------------------------------------------------------===//
 
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 0074be35798a..01bc5387e672 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1294,6 +1294,14 @@ def HasVendorXqcicli
       AssemblerPredicate<(all_of FeatureVendorXqcicli),
                          "'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)">;
 
+def FeatureVendorXqcicm
+    : RISCVExperimentalExtension<0, 2, "Qualcomm uC Conditional Move Extension",
+                                 [FeatureStdExtZca]>;
+def HasVendorXqcicm
+    : Predicate<"Subtarget->hasVendorXqcicm()">,
+      AssemblerPredicate<(all_of FeatureVendorXqcicm),
+                         "'Xqcicm' (Qualcomm uC Conditional Move Extension)">;
+
 //===----------------------------------------------------------------------===//
 // LLVM specific features and extensions
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4a0304f3322e..6c58989b1afb 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -18385,7 +18385,7 @@ bool RISCVTargetLowering::isDesirableToCommuteWithShift(
     auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
 
     // Bail if we might break a sh{1,2,3}add pattern.
-    if (Subtarget.hasStdExtZba() && C2->getZExtValue() >= 1 &&
+    if (Subtarget.hasStdExtZba() && C2 && C2->getZExtValue() >= 1 &&
         C2->getZExtValue() <= 3 && N->hasOneUse() &&
         N->user_begin()->getOpcode() == ISD::ADD &&
         !isUsedByLdSt(*N->user_begin(), nullptr) &&
@@ -20273,13 +20273,11 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
   for (auto &Reg : RegsToPass)
     Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
 
-  if (!IsTailCall) {
-    // Add a register mask operand representing the call-preserved registers.
-    const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
-    const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
-    assert(Mask && "Missing call preserved mask for calling convention");
-    Ops.push_back(DAG.getRegisterMask(Mask));
-  }
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
 
   // Glue the call to the argument copies, if any.
   if (Glue.getNode())
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
index ae969bff82fd..349bc361c90f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -23,7 +23,9 @@ def SDT_RISCVSplitF64     : SDTypeProfile<2, 1, [SDTCisVT<0, i32>,
                                                  SDTCisVT<2, f64>]>;
 
 def RISCVBuildPairF64 : SDNode<"RISCVISD::BuildPairF64", SDT_RISCVBuildPairF64>;
+def : GINodeEquiv<G_MERGE_VALUES, RISCVBuildPairF64>;
 def RISCVSplitF64     : SDNode<"RISCVISD::SplitF64", SDT_RISCVSplitF64>;
+def : GINodeEquiv<G_UNMERGE_VALUES, RISCVSplitF64>;
 
 def AddrRegImmINX : ComplexPattern<iPTR, 2, "SelectAddrRegImmRV32Zdinx">;
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 5e6722cb4995..6f15646852f9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -150,6 +150,22 @@ class QCILICC<bits<3> funct3, bits<2> funct2, DAGOperand InTyRs2, string opcodes
   let Inst{31-25} = {simm, funct2};
 }
 
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class QCIMVCC<bits<3> funct3, string opcodestr>
+    : RVInstR4<0b00, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd),
+               (ins GPRNoX0:$rs1, GPRNoX0:$rs2, GPRNoX0:$rs3),
+               opcodestr, "$rd, $rs1, $rs2, $rs3">;
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class QCIMVCCI<bits<3> funct3, string opcodestr, DAGOperand immType>
+    : RVInstR4<0b10, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd),
+               (ins GPRNoX0:$rs1, immType:$imm, GPRNoX0:$rs3),
+               opcodestr, "$rd, $rs1, $imm, $rs3"> {
+  bits<5> imm;
+
+  let rs2 = imm;
+}
+
 //===----------------------------------------------------------------------===//
 // Instructions
 //===----------------------------------------------------------------------===//
@@ -270,6 +286,32 @@ let Predicates = [HasVendorXqcicli, IsRV32], DecoderNamespace = "Xqcicli" in {
   def QC_LIGEUI  : QCILICC<0b111, 0b11, uimm5, "qc.ligeui">;
 } // Predicates = [HasVendorXqcicli, IsRV32], DecoderNamespace = "Xqcicli"
 
+let Predicates = [HasVendorXqcicm, IsRV32], DecoderNamespace = "Xqcicm" in {
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+  def QC_C_MVEQZ   : RVInst16CL<0b101, 0b10, (outs GPRC:$rd_wb),
+                              (ins GPRC:$rd, GPRC:$rs1),
+                              "qc.c.mveqz", "$rd, $rs1"> {
+    let Constraints = "$rd = $rd_wb";
+
+    let Inst{12-10} = 0b011;
+    let Inst{6-5} = 0b00;
+  }
+
+  def QC_MVEQ    : QCIMVCC<0b000, "qc.mveq">;
+  def QC_MVNE    : QCIMVCC<0b001, "qc.mvne">;
+  def QC_MVLT    : QCIMVCC<0b100, "qc.mvlt">;
+  def QC_MVGE    : QCIMVCC<0b101, "qc.mvge">;
+  def QC_MVLTU   : QCIMVCC<0b110, "qc.mvltu">;
+  def QC_MVGEU   : QCIMVCC<0b111, "qc.mvgeu">;
+
+  def QC_MVEQI   : QCIMVCCI<0b000, "qc.mveqi", simm5>;
+  def QC_MVNEI   : QCIMVCCI<0b001, "qc.mvnei", simm5>;
+  def QC_MVLTI   : QCIMVCCI<0b100, "qc.mvlti", simm5>;
+  def QC_MVGEI   : QCIMVCCI<0b101, "qc.mvgei", simm5>;
+  def QC_MVLTUI  : QCIMVCCI<0b110, "qc.mvltui", uimm5>;
+  def QC_MVGEUI  : QCIMVCCI<0b111, "qc.mvgeui", uimm5>;
+} // Predicates = [HasVendorXqcicm, IsRV32], DecoderNamespace = "Xqcicm"
+
 //===----------------------------------------------------------------------===//
 // Aliases
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVPfmCounters.td b/llvm/lib/Target/RISCV/RISCVPfmCounters.td
new file mode 100644
index 000000000000..013e789a9e92
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVPfmCounters.td
@@ -0,0 +1,18 @@
+//===---- RISCVPfmCounters.td - RISC-V Hardware Counters ---*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the available hardware counters for RISC-V.
+//
+//===----------------------------------------------------------------------===//
+
+def CpuCyclesPfmCounter : PfmCounter<"CYCLES">;
+
+def DefaultPfmCounters : ProcPfmCounters {
+  let CycleCounter = CpuCyclesPfmCounter;
+}
+def : PfmCountersDefaultBinding<DefaultPfmCounters>;
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 61c7c2136703..6dfed7ddeb9f 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -321,6 +321,25 @@ def SIFIVE_P470 : RISCVProcessorModel<"sifive-p470", SiFiveP400Model,
                                                   [TuneNoSinkSplatOperands,
                                                    TuneVXRMPipelineFlush])>;
 
+defvar SiFiveP500TuneFeatures = [TuneNoDefaultUnroll,
+                                 TuneConditionalCompressedMoveFusion,
+                                 TuneLUIADDIFusion,
+                                 TuneAUIPCADDIFusion,
+                                 TunePostRAScheduler];
+
+def SIFIVE_P550 : RISCVProcessorModel<"sifive-p550", NoSchedModel,
+                                      [Feature64Bit,
+                                       FeatureStdExtI,
+                                       FeatureStdExtZifencei,
+                                       FeatureStdExtM,
+                                       FeatureStdExtA,
+                                       FeatureStdExtF,
+                                       FeatureStdExtD,
+                                       FeatureStdExtC,
+                                       FeatureStdExtZba,
+                                       FeatureStdExtZbb],
+                                      SiFiveP500TuneFeatures>;
+
 def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
                                       !listconcat(RVA22U64Features,
                                       [FeatureStdExtV,
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
index a86c255f0820..396cbe2c476c 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
@@ -182,7 +182,7 @@ def P400WriteCMOV : SchedWriteRes<[SiFiveP400Branch, SiFiveP400IEXQ1]> {
 }
 def : InstRW<[P400WriteCMOV], (instrs PseudoCCMOVGPRNoX0)>;
 
-let Latency = 3 in {
+let Latency = 2 in {
 // Integer multiplication
 def : WriteRes<WriteIMul, [SiFiveP400MulDiv]>;
 def : WriteRes<WriteIMul32, [SiFiveP400MulDiv]>;
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 32d552625a8e..ad61a77df905 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -50,7 +50,10 @@ public:
   StringRef getPassName() const override { return PASS_NAME; }
 
 private:
-  bool checkUsers(const MachineOperand *&CommonVL, MachineInstr &MI);
+  std::optional<MachineOperand> getMinimumVLForUser(MachineOperand &UserOp);
+  /// Returns the largest common VL MachineOperand that may be used to optimize
+  /// MI. Returns std::nullopt if it failed to find a suitable VL.
+  std::optional<MachineOperand> checkUsers(MachineInstr &MI);
   bool tryReduceVL(MachineInstr &MI);
   bool isCandidate(const MachineInstr &MI) const;
 };
@@ -76,11 +79,6 @@ static bool isVectorRegClass(Register R, const MachineRegisterInfo *MRI) {
 
 /// Represents the EMUL and EEW of a MachineOperand.
 struct OperandInfo {
-  enum class State {
-    Unknown,
-    Known,
-  } S;
-
   // Represent as 1,2,4,8, ... and fractional indicator. This is because
   // EMUL can take on values that don't map to RISCVII::VLMUL values exactly.
   // For example, a mask operand can have an EMUL less than MF8.
@@ -89,34 +87,32 @@ struct OperandInfo {
   unsigned Log2EEW;
 
   OperandInfo(RISCVII::VLMUL EMUL, unsigned Log2EEW)
-      : S(State::Known), EMUL(RISCVVType::decodeVLMUL(EMUL)), Log2EEW(Log2EEW) {
-  }
+      : EMUL(RISCVVType::decodeVLMUL(EMUL)), Log2EEW(Log2EEW) {}
 
   OperandInfo(std::pair<unsigned, bool> EMUL, unsigned Log2EEW)
-      : S(State::Known), EMUL(EMUL), Log2EEW(Log2EEW) {}
+      : EMUL(EMUL), Log2EEW(Log2EEW) {}
 
-  OperandInfo() : S(State::Unknown) {}
+  OperandInfo(unsigned Log2EEW) : Log2EEW(Log2EEW) {}
 
-  bool isUnknown() const { return S == State::Unknown; }
-  bool isKnown() const { return S == State::Known; }
+  OperandInfo() = delete;
 
   static bool EMULAndEEWAreEqual(const OperandInfo &A, const OperandInfo &B) {
-    assert(A.isKnown() && B.isKnown() && "Both operands must be known");
-
     return A.Log2EEW == B.Log2EEW && A.EMUL->first == B.EMUL->first &&
            A.EMUL->second == B.EMUL->second;
   }
 
+  static bool EEWAreEqual(const OperandInfo &A, const OperandInfo &B) {
+    return A.Log2EEW == B.Log2EEW;
+  }
+
   void print(raw_ostream &OS) const {
-    if (isUnknown()) {
-      OS << "Unknown";
-      return;
-    }
-    assert(EMUL && "Expected EMUL to have value");
-    OS << "EMUL: m";
-    if (EMUL->second)
-      OS << "f";
-    OS << EMUL->first;
+    if (EMUL) {
+      OS << "EMUL: m";
+      if (EMUL->second)
+        OS << "f";
+      OS << EMUL->first;
+    } else
+      OS << "EMUL: unknown\n";
     OS << ", EEW: " << (1 << Log2EEW);
   }
 };
@@ -127,30 +123,18 @@ static raw_ostream &operator<<(raw_ostream &OS, const OperandInfo &OI) {
   return OS;
 }
 
-namespace llvm {
-namespace RISCVVType {
-/// Return the RISCVII::VLMUL that is two times VLMul.
-/// Precondition: VLMul is not LMUL_RESERVED or LMUL_8.
-static RISCVII::VLMUL twoTimesVLMUL(RISCVII::VLMUL VLMul) {
-  switch (VLMul) {
-  case RISCVII::VLMUL::LMUL_F8:
-    return RISCVII::VLMUL::LMUL_F4;
-  case RISCVII::VLMUL::LMUL_F4:
-    return RISCVII::VLMUL::LMUL_F2;
-  case RISCVII::VLMUL::LMUL_F2:
-    return RISCVII::VLMUL::LMUL_1;
-  case RISCVII::VLMUL::LMUL_1:
-    return RISCVII::VLMUL::LMUL_2;
-  case RISCVII::VLMUL::LMUL_2:
-    return RISCVII::VLMUL::LMUL_4;
-  case RISCVII::VLMUL::LMUL_4:
-    return RISCVII::VLMUL::LMUL_8;
-  case RISCVII::VLMUL::LMUL_8:
-  default:
-    llvm_unreachable("Could not multiply VLMul by 2");
-  }
+LLVM_ATTRIBUTE_UNUSED
+static raw_ostream &operator<<(raw_ostream &OS,
+                               const std::optional<OperandInfo> &OI) {
+  if (OI)
+    OI->print(OS);
+  else
+    OS << "nullopt";
+  return OS;
 }
 
+namespace llvm {
+namespace RISCVVType {
 /// Return EMUL = (EEW / SEW) * LMUL where EEW comes from Log2EEW and LMUL and
 /// SEW are from the TSFlags of MI.
 static std::pair<unsigned, bool>
@@ -180,24 +164,22 @@ getEMULEqualsEEWDivSEWTimesLMUL(unsigned Log2EEW, const MachineInstr &MI) {
 } // end namespace RISCVVType
 } // end namespace llvm
 
-/// Dest has EEW=SEW and EMUL=LMUL. Source EEW=SEW/Factor (i.e. F2 => EEW/2).
-/// Source has EMUL=(EEW/SEW)*LMUL. LMUL and SEW comes from TSFlags of MI.
-static OperandInfo getIntegerExtensionOperandInfo(unsigned Factor,
-                                                  const MachineInstr &MI,
-                                                  const MachineOperand &MO) {
-  RISCVII::VLMUL MIVLMul = RISCVII::getLMul(MI.getDesc().TSFlags);
+/// Dest has EEW=SEW. Source EEW=SEW/Factor (i.e. F2 => EEW/2).
+/// SEW comes from TSFlags of MI.
+static unsigned getIntegerExtensionOperandEEW(unsigned Factor,
+                                              const MachineInstr &MI,
+                                              const MachineOperand &MO) {
   unsigned MILog2SEW =
       MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm();
 
   if (MO.getOperandNo() == 0)
-    return OperandInfo(MIVLMul, MILog2SEW);
+    return MILog2SEW;
 
   unsigned MISEW = 1 << MILog2SEW;
   unsigned EEW = MISEW / Factor;
   unsigned Log2EEW = Log2_32(EEW);
 
-  return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(Log2EEW, MI),
-                     Log2EEW);
+  return Log2EEW;
 }
 
 /// Check whether MO is a mask operand of MI.
@@ -211,18 +193,15 @@ static bool isMaskOperand(const MachineInstr &MI, const MachineOperand &MO,
   return Desc.operands()[MO.getOperandNo()].RegClass == RISCV::VMV0RegClassID;
 }
 
-/// Return the OperandInfo for MO.
-static OperandInfo getOperandInfo(const MachineOperand &MO,
-                                  const MachineRegisterInfo *MRI) {
+static std::optional<unsigned>
+getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
   const MachineInstr &MI = *MO.getParent();
   const RISCVVPseudosTable::PseudoInfo *RVV =
       RISCVVPseudosTable::getPseudoInfo(MI.getOpcode());
   assert(RVV && "Could not find MI in PseudoTable");
 
-  // MI has a VLMUL and SEW associated with it. The RVV specification defines
-  // the LMUL and SEW of each operand and definition in relation to MI.VLMUL and
-  // MI.SEW.
-  RISCVII::VLMUL MIVLMul = RISCVII::getLMul(MI.getDesc().TSFlags);
+  // MI has a SEW associated with it. The RVV specification defines
+  // the EEW of each operand and definition in relation to MI.SEW.
   unsigned MILog2SEW =
       MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm();
 
@@ -233,13 +212,13 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   // since they must preserve the entire register content.
   if (HasPassthru && MO.getOperandNo() == MI.getNumExplicitDefs() &&
       (MO.getReg() != RISCV::NoRegister))
-    return {};
+    return std::nullopt;
 
   bool IsMODef = MO.getOperandNo() == 0;
 
-  // All mask operands have EEW=1, EMUL=(EEW/SEW)*LMUL
+  // All mask operands have EEW=1
   if (isMaskOperand(MI, MO, MRI))
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(0, MI), 0);
+    return 0;
 
   // switch against BaseInstr to reduce number of cases that need to be
   // considered.
@@ -256,55 +235,65 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   // Vector Loads and Stores
   // Vector Unit-Stride Instructions
   // Vector Strided Instructions
-  /// Dest EEW encoded in the instruction and EMUL=(EEW/SEW)*LMUL
+  /// Dest EEW encoded in the instruction
+  case RISCV::VLM_V:
+  case RISCV::VSM_V:
+    return 0;
+  case RISCV::VLE8_V:
   case RISCV::VSE8_V:
+  case RISCV::VLSE8_V:
   case RISCV::VSSE8_V:
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(3, MI), 3);
+    return 3;
+  case RISCV::VLE16_V:
   case RISCV::VSE16_V:
+  case RISCV::VLSE16_V:
   case RISCV::VSSE16_V:
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(4, MI), 4);
+    return 4;
+  case RISCV::VLE32_V:
   case RISCV::VSE32_V:
+  case RISCV::VLSE32_V:
   case RISCV::VSSE32_V:
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(5, MI), 5);
+    return 5;
+  case RISCV::VLE64_V:
   case RISCV::VSE64_V:
+  case RISCV::VLSE64_V:
   case RISCV::VSSE64_V:
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(6, MI), 6);
+    return 6;
 
   // Vector Indexed Instructions
   // vs(o|u)xei<eew>.v
-  // Dest/Data (operand 0) EEW=SEW, EMUL=LMUL. Source EEW=<eew> and
-  // EMUL=(EEW/SEW)*LMUL.
+  // Dest/Data (operand 0) EEW=SEW.  Source EEW=<eew>.
   case RISCV::VLUXEI8_V:
   case RISCV::VLOXEI8_V:
   case RISCV::VSUXEI8_V:
   case RISCV::VSOXEI8_V: {
     if (MO.getOperandNo() == 0)
-      return OperandInfo(MIVLMul, MILog2SEW);
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(3, MI), 3);
+      return MILog2SEW;
+    return 3;
   }
   case RISCV::VLUXEI16_V:
   case RISCV::VLOXEI16_V:
   case RISCV::VSUXEI16_V:
   case RISCV::VSOXEI16_V: {
     if (MO.getOperandNo() == 0)
-      return OperandInfo(MIVLMul, MILog2SEW);
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(4, MI), 4);
+      return MILog2SEW;
+    return 4;
   }
   case RISCV::VLUXEI32_V:
   case RISCV::VLOXEI32_V:
   case RISCV::VSUXEI32_V:
   case RISCV::VSOXEI32_V: {
     if (MO.getOperandNo() == 0)
-      return OperandInfo(MIVLMul, MILog2SEW);
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(5, MI), 5);
+      return MILog2SEW;
+    return 5;
   }
   case RISCV::VLUXEI64_V:
   case RISCV::VLOXEI64_V:
   case RISCV::VSUXEI64_V:
   case RISCV::VSOXEI64_V: {
     if (MO.getOperandNo() == 0)
-      return OperandInfo(MIVLMul, MILog2SEW);
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(6, MI), 6);
+      return MILog2SEW;
+    return 6;
   }
 
   // Vector Integer Arithmetic Instructions
@@ -318,7 +307,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VRSUB_VX:
   // Vector Bitwise Logical Instructions
   // Vector Single-Width Shift Instructions
-  // EEW=SEW. EMUL=LMUL.
+  // EEW=SEW.
   case RISCV::VAND_VI:
   case RISCV::VAND_VV:
   case RISCV::VAND_VX:
@@ -338,7 +327,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VSRA_VV:
   case RISCV::VSRA_VX:
   // Vector Integer Min/Max Instructions
-  // EEW=SEW. EMUL=LMUL.
+  // EEW=SEW.
   case RISCV::VMINU_VV:
   case RISCV::VMINU_VX:
   case RISCV::VMIN_VV:
@@ -348,7 +337,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VMAX_VV:
   case RISCV::VMAX_VX:
   // Vector Single-Width Integer Multiply Instructions
-  // Source and Dest EEW=SEW and EMUL=LMUL.
+  // Source and Dest EEW=SEW.
   case RISCV::VMUL_VV:
   case RISCV::VMUL_VX:
   case RISCV::VMULH_VV:
@@ -358,7 +347,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VMULHSU_VV:
   case RISCV::VMULHSU_VX:
   // Vector Integer Divide Instructions
-  // EEW=SEW. EMUL=LMUL.
+  // EEW=SEW.
   case RISCV::VDIVU_VV:
   case RISCV::VDIVU_VX:
   case RISCV::VDIV_VV:
@@ -368,7 +357,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VREM_VV:
   case RISCV::VREM_VX:
   // Vector Single-Width Integer Multiply-Add Instructions
-  // EEW=SEW. EMUL=LMUL.
+  // EEW=SEW.
   case RISCV::VMACC_VV:
   case RISCV::VMACC_VX:
   case RISCV::VNMSAC_VV:
@@ -379,8 +368,8 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VNMSUB_VX:
   // Vector Integer Merge Instructions
   // Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
-  // EEW=SEW and EMUL=LMUL, except the mask operand has EEW=1 and EMUL=
-  // (EEW/SEW)*LMUL. Mask operand is handled before this switch.
+  // EEW=SEW, except the mask operand has EEW=1. Mask operand is handled
+  // before this switch.
   case RISCV::VMERGE_VIM:
   case RISCV::VMERGE_VVM:
   case RISCV::VMERGE_VXM:
@@ -393,7 +382,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   // Vector Fixed-Point Arithmetic Instructions
   // Vector Single-Width Saturating Add and Subtract
   // Vector Single-Width Averaging Add and Subtract
-  // EEW=SEW. EMUL=LMUL.
+  // EEW=SEW.
   case RISCV::VMV_V_I:
   case RISCV::VMV_V_V:
   case RISCV::VMV_V_X:
@@ -416,12 +405,12 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VASUB_VV:
   case RISCV::VASUB_VX:
   // Vector Single-Width Fractional Multiply with Rounding and Saturation
-  // EEW=SEW. EMUL=LMUL. The instruction produces 2*SEW product internally but
+  // EEW=SEW. The instruction produces 2*SEW product internally but
   // saturates to fit into SEW bits.
   case RISCV::VSMUL_VV:
   case RISCV::VSMUL_VX:
   // Vector Single-Width Scaling Shift Instructions
-  // EEW=SEW. EMUL=LMUL.
+  // EEW=SEW.
   case RISCV::VSSRL_VI:
   case RISCV::VSSRL_VV:
   case RISCV::VSSRL_VX:
@@ -431,13 +420,13 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   // Vector Permutation Instructions
   // Integer Scalar Move Instructions
   // Floating-Point Scalar Move Instructions
-  // EMUL=LMUL. EEW=SEW.
+  // EEW=SEW.
   case RISCV::VMV_X_S:
   case RISCV::VMV_S_X:
   case RISCV::VFMV_F_S:
   case RISCV::VFMV_S_F:
   // Vector Slide Instructions
-  // EMUL=LMUL. EEW=SEW.
+  // EEW=SEW.
   case RISCV::VSLIDEUP_VI:
   case RISCV::VSLIDEUP_VX:
   case RISCV::VSLIDEDOWN_VI:
@@ -447,12 +436,12 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VSLIDE1DOWN_VX:
   case RISCV::VFSLIDE1DOWN_VF:
   // Vector Register Gather Instructions
-  // EMUL=LMUL. EEW=SEW. For mask operand, EMUL=1 and EEW=1.
+  // EEW=SEW. For mask operand, EEW=1.
   case RISCV::VRGATHER_VI:
   case RISCV::VRGATHER_VV:
   case RISCV::VRGATHER_VX:
   // Vector Compress Instruction
-  // EMUL=LMUL. EEW=SEW.
+  // EEW=SEW.
   case RISCV::VCOMPRESS_VM:
   // Vector Element Index Instruction
   case RISCV::VID_V:
@@ -499,10 +488,10 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VFCVT_F_X_V:
   // Vector Floating-Point Merge Instruction
   case RISCV::VFMERGE_VFM:
-    return OperandInfo(MIVLMul, MILog2SEW);
+    return MILog2SEW;
 
   // Vector Widening Integer Add/Subtract
-  // Def uses EEW=2*SEW and EMUL=2*LMUL. Operands use EEW=SEW and EMUL=LMUL.
+  // Def uses EEW=2*SEW . Operands use EEW=SEW.
   case RISCV::VWADDU_VV:
   case RISCV::VWADDU_VX:
   case RISCV::VWSUBU_VV:
@@ -513,7 +502,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VWSUB_VX:
   case RISCV::VWSLL_VI:
   // Vector Widening Integer Multiply Instructions
-  // Source and Destination EMUL=LMUL. Destination EEW=2*SEW. Source EEW=SEW.
+  // Destination EEW=2*SEW. Source EEW=SEW.
   case RISCV::VWMUL_VV:
   case RISCV::VWMUL_VX:
   case RISCV::VWMULSU_VV:
@@ -521,7 +510,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VWMULU_VV:
   case RISCV::VWMULU_VX:
   // Vector Widening Integer Multiply-Add Instructions
-  // Destination EEW=2*SEW and EMUL=2*LMUL. Source EEW=SEW and EMUL=LMUL.
+  // Destination EEW=2*SEW. Source EEW=SEW.
   // A SEW-bit*SEW-bit multiply of the sources forms a 2*SEW-bit value, which
   // is then added to the 2*SEW-bit Dest. These instructions never have a
   // passthru operand.
@@ -542,7 +531,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VFWNMSAC_VF:
   case RISCV::VFWNMSAC_VV:
   // Vector Widening Floating-Point Add/Subtract Instructions
-  // Dest EEW=2*SEW and EMUL=2*LMUL. Source EEW=SEW and EMUL=LMUL.
+  // Dest EEW=2*SEW. Source EEW=SEW.
   case RISCV::VFWADD_VV:
   case RISCV::VFWADD_VF:
   case RISCV::VFWSUB_VV:
@@ -559,12 +548,10 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VFWCVT_F_X_V:
   case RISCV::VFWCVT_F_F_V: {
     unsigned Log2EEW = IsMODef ? MILog2SEW + 1 : MILog2SEW;
-    RISCVII::VLMUL EMUL =
-        IsMODef ? RISCVVType::twoTimesVLMUL(MIVLMul) : MIVLMul;
-    return OperandInfo(EMUL, Log2EEW);
+    return Log2EEW;
   }
 
-  // Def and Op1 uses EEW=2*SEW and EMUL=2*LMUL. Op2 uses EEW=SEW and EMUL=LMUL
+  // Def and Op1 uses EEW=2*SEW. Op2 uses EEW=SEW.
   case RISCV::VWADDU_WV:
   case RISCV::VWADDU_WX:
   case RISCV::VWSUBU_WV:
@@ -581,25 +568,22 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
     bool IsOp1 = HasPassthru ? MO.getOperandNo() == 2 : MO.getOperandNo() == 1;
     bool TwoTimes = IsMODef || IsOp1;
     unsigned Log2EEW = TwoTimes ? MILog2SEW + 1 : MILog2SEW;
-    RISCVII::VLMUL EMUL =
-        TwoTimes ? RISCVVType::twoTimesVLMUL(MIVLMul) : MIVLMul;
-    return OperandInfo(EMUL, Log2EEW);
+    return Log2EEW;
   }
 
   // Vector Integer Extension
   case RISCV::VZEXT_VF2:
   case RISCV::VSEXT_VF2:
-    return getIntegerExtensionOperandInfo(2, MI, MO);
+    return getIntegerExtensionOperandEEW(2, MI, MO);
   case RISCV::VZEXT_VF4:
   case RISCV::VSEXT_VF4:
-    return getIntegerExtensionOperandInfo(4, MI, MO);
+    return getIntegerExtensionOperandEEW(4, MI, MO);
   case RISCV::VZEXT_VF8:
   case RISCV::VSEXT_VF8:
-    return getIntegerExtensionOperandInfo(8, MI, MO);
+    return getIntegerExtensionOperandEEW(8, MI, MO);
 
   // Vector Narrowing Integer Right Shift Instructions
-  // Destination EEW=SEW and EMUL=LMUL, Op 1 has EEW=2*SEW EMUL=2*LMUL. Op2 has
-  // EEW=SEW EMUL=LMUL.
+  // Destination EEW=SEW, Op 1 has EEW=2*SEW. Op2 has EEW=SEW
   case RISCV::VNSRL_WX:
   case RISCV::VNSRL_WI:
   case RISCV::VNSRL_WV:
@@ -607,7 +591,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VNSRA_WV:
   case RISCV::VNSRA_WX:
   // Vector Narrowing Fixed-Point Clip Instructions
-  // Destination and Op1 EEW=SEW and EMUL=LMUL. Op2 EEW=2*SEW and EMUL=2*LMUL
+  // Destination and Op1 EEW=SEW. Op2 EEW=2*SEW.
   case RISCV::VNCLIPU_WI:
   case RISCV::VNCLIPU_WV:
   case RISCV::VNCLIPU_WX:
@@ -626,9 +610,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
     bool IsOp1 = HasPassthru ? MO.getOperandNo() == 2 : MO.getOperandNo() == 1;
     bool TwoTimes = IsOp1;
     unsigned Log2EEW = TwoTimes ? MILog2SEW + 1 : MILog2SEW;
-    RISCVII::VLMUL EMUL =
-        TwoTimes ? RISCVVType::twoTimesVLMUL(MIVLMul) : MIVLMul;
-    return OperandInfo(EMUL, Log2EEW);
+    return Log2EEW;
   }
 
   // Vector Mask Instructions
@@ -636,7 +618,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   // vmsbf.m set-before-first mask bit
   // vmsif.m set-including-first mask bit
   // vmsof.m set-only-first mask bit
-  // EEW=1 and EMUL=(EEW/SEW)*LMUL
+  // EEW=1
   // We handle the cases when operand is a v0 mask operand above the switch,
   // but these instructions may use non-v0 mask operands and need to be handled
   // specifically.
@@ -651,20 +633,20 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VMSBF_M:
   case RISCV::VMSIF_M:
   case RISCV::VMSOF_M: {
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(0, MI), 0);
+    return 0;
   }
 
   // Vector Iota Instruction
-  // EEW=SEW and EMUL=LMUL, except the mask operand has EEW=1 and EMUL=
-  // (EEW/SEW)*LMUL. Mask operand is not handled before this switch.
+  // EEW=SEW, except the mask operand has EEW=1. Mask operand is not handled
+  // before this switch.
   case RISCV::VIOTA_M: {
     if (IsMODef || MO.getOperandNo() == 1)
-      return OperandInfo(MIVLMul, MILog2SEW);
-    return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(0, MI), 0);
+      return MILog2SEW;
+    return 0;
   }
 
   // Vector Integer Compare Instructions
-  // Dest EEW=1 and EMUL=(EEW/SEW)*LMUL. Source EEW=SEW and EMUL=LMUL.
+  // Dest EEW=1. Source EEW=SEW.
   case RISCV::VMSEQ_VI:
   case RISCV::VMSEQ_VV:
   case RISCV::VMSEQ_VX:
@@ -686,21 +668,20 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VMSGT_VI:
   case RISCV::VMSGT_VX:
   // Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
-  // Dest EEW=1 and EMUL=(EEW/SEW)*LMUL. Source EEW=SEW and EMUL=LMUL. Mask
-  // source operand handled above this switch.
+  // Dest EEW=1. Source EEW=SEW. Mask source operand handled above this switch.
   case RISCV::VMADC_VIM:
   case RISCV::VMADC_VVM:
   case RISCV::VMADC_VXM:
   case RISCV::VMSBC_VVM:
   case RISCV::VMSBC_VXM:
-  // Dest EEW=1 and EMUL=(EEW/SEW)*LMUL. Source EEW=SEW and EMUL=LMUL.
+  // Dest EEW=1. Source EEW=SEW.
   case RISCV::VMADC_VV:
   case RISCV::VMADC_VI:
   case RISCV::VMADC_VX:
   case RISCV::VMSBC_VV:
   case RISCV::VMSBC_VX:
   // 13.13. Vector Floating-Point Compare Instructions
-  // Dest EEW=1 and EMUL=(EEW/SEW)*LMUL. Source EEW=SEW EMUL=LMUL.
+  // Dest EEW=1. Source EEW=SEW
   case RISCV::VMFEQ_VF:
   case RISCV::VMFEQ_VV:
   case RISCV::VMFNE_VF:
@@ -712,15 +693,62 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VMFGT_VF:
   case RISCV::VMFGE_VF: {
     if (IsMODef)
-      return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(0, MI), 0);
-    return OperandInfo(MIVLMul, MILog2SEW);
+      return 0;
+    return MILog2SEW;
+  }
+
+  // Vector Reduction Operations
+  // Vector Single-Width Integer Reduction Instructions
+  case RISCV::VREDAND_VS:
+  case RISCV::VREDMAX_VS:
+  case RISCV::VREDMAXU_VS:
+  case RISCV::VREDMIN_VS:
+  case RISCV::VREDMINU_VS:
+  case RISCV::VREDOR_VS:
+  case RISCV::VREDSUM_VS:
+  case RISCV::VREDXOR_VS: {
+    return MILog2SEW;
   }
 
   default:
-    return {};
+    return std::nullopt;
   }
 }
 
+static std::optional<OperandInfo>
+getOperandInfo(const MachineOperand &MO, const MachineRegisterInfo *MRI) {
+  const MachineInstr &MI = *MO.getParent();
+  const RISCVVPseudosTable::PseudoInfo *RVV =
+      RISCVVPseudosTable::getPseudoInfo(MI.getOpcode());
+  assert(RVV && "Could not find MI in PseudoTable");
+
+  std::optional<unsigned> Log2EEW = getOperandLog2EEW(MO, MRI);
+  if (!Log2EEW)
+    return std::nullopt;
+
+  switch (RVV->BaseInstr) {
+  // Vector Reduction Operations
+  // Vector Single-Width Integer Reduction Instructions
+  // The Dest and VS1 only read element 0 of the vector register. Return just
+  // the EEW for these.
+  case RISCV::VREDAND_VS:
+  case RISCV::VREDMAX_VS:
+  case RISCV::VREDMAXU_VS:
+  case RISCV::VREDMIN_VS:
+  case RISCV::VREDMINU_VS:
+  case RISCV::VREDOR_VS:
+  case RISCV::VREDSUM_VS:
+  case RISCV::VREDXOR_VS:
+    if (MO.getOperandNo() != 2)
+      return OperandInfo(*Log2EEW);
+    break;
+  };
+
+  // All others have EMUL=EEW/SEW*LMUL
+  return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(*Log2EEW, MI),
+                     *Log2EEW);
+}
+
 /// Return true if this optimization should consider MI for VL reduction. This
 /// white-list approach simplifies this optimization for instructions that may
 /// have more complex semantics with relation to how it uses VL.
@@ -732,6 +760,32 @@ static bool isSupportedInstr(const MachineInstr &MI) {
     return false;
 
   switch (RVV->BaseInstr) {
+  // Vector Unit-Stride Instructions
+  // Vector Strided Instructions
+  case RISCV::VLM_V:
+  case RISCV::VLE8_V:
+  case RISCV::VLSE8_V:
+  case RISCV::VLE16_V:
+  case RISCV::VLSE16_V:
+  case RISCV::VLE32_V:
+  case RISCV::VLSE32_V:
+  case RISCV::VLE64_V:
+  case RISCV::VLSE64_V:
+  // Vector Indexed Instructions
+  case RISCV::VLUXEI8_V:
+  case RISCV::VLOXEI8_V:
+  case RISCV::VLUXEI16_V:
+  case RISCV::VLOXEI16_V:
+  case RISCV::VLUXEI32_V:
+  case RISCV::VLOXEI32_V:
+  case RISCV::VLUXEI64_V:
+  case RISCV::VLOXEI64_V: {
+    for (const MachineMemOperand *MMO : MI.memoperands())
+      if (MMO->isVolatile())
+        return false;
+    return true;
+  }
+
   // Vector Single-Width Integer Add and Subtract
   case RISCV::VADD_VI:
   case RISCV::VADD_VV:
@@ -901,6 +955,30 @@ static bool isSupportedInstr(const MachineInstr &MI) {
   case RISCV::VMSOF_M:
   case RISCV::VIOTA_M:
   case RISCV::VID_V:
+  // Single-Width Floating-Point/Integer Type-Convert Instructions
+  case RISCV::VFCVT_XU_F_V:
+  case RISCV::VFCVT_X_F_V:
+  case RISCV::VFCVT_RTZ_XU_F_V:
+  case RISCV::VFCVT_RTZ_X_F_V:
+  case RISCV::VFCVT_F_XU_V:
+  case RISCV::VFCVT_F_X_V:
+  // Widening Floating-Point/Integer Type-Convert Instructions
+  case RISCV::VFWCVT_XU_F_V:
+  case RISCV::VFWCVT_X_F_V:
+  case RISCV::VFWCVT_RTZ_XU_F_V:
+  case RISCV::VFWCVT_RTZ_X_F_V:
+  case RISCV::VFWCVT_F_XU_V:
+  case RISCV::VFWCVT_F_X_V:
+  case RISCV::VFWCVT_F_F_V:
+  // Narrowing Floating-Point/Integer Type-Convert Instructions
+  case RISCV::VFNCVT_XU_F_W:
+  case RISCV::VFNCVT_X_F_W:
+  case RISCV::VFNCVT_RTZ_XU_F_W:
+  case RISCV::VFNCVT_RTZ_X_F_W:
+  case RISCV::VFNCVT_F_XU_W:
+  case RISCV::VFNCVT_F_X_W:
+  case RISCV::VFNCVT_F_F_W:
+  case RISCV::VFNCVT_ROD_F_F_W:
     return true;
   }
 
@@ -1007,6 +1085,11 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
     return false;
   }
 
+  if (MI.mayRaiseFPException()) {
+    LLVM_DEBUG(dbgs() << "Not a candidate because may raise FP exception\n");
+    return false;
+  }
+
   // Some instructions that produce vectors have semantics that make it more
   // difficult to determine whether the VL can be reduced. For example, some
   // instructions, such as reductions, may write lanes past VL to a scalar
@@ -1028,79 +1111,103 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
   return true;
 }
 
-bool RISCVVLOptimizer::checkUsers(const MachineOperand *&CommonVL,
-                                  MachineInstr &MI) {
+std::optional<MachineOperand>
+RISCVVLOptimizer::getMinimumVLForUser(MachineOperand &UserOp) {
+  const MachineInstr &UserMI = *UserOp.getParent();
+  const MCInstrDesc &Desc = UserMI.getDesc();
+
+  if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags)) {
+    LLVM_DEBUG(dbgs() << "    Abort due to lack of VL, assume that"
+                         " use VLMAX\n");
+    return std::nullopt;
+  }
+
+  // Instructions like reductions may use a vector register as a scalar
+  // register. In this case, we should treat it as only reading the first lane.
+  if (isVectorOpUsedAsScalarOp(UserOp)) {
+    [[maybe_unused]] Register R = UserOp.getReg();
+    [[maybe_unused]] const TargetRegisterClass *RC = MRI->getRegClass(R);
+    assert(RISCV::VRRegClass.hasSubClassEq(RC) &&
+           "Expect LMUL 1 register class for vector as scalar operands!");
+    LLVM_DEBUG(dbgs() << "    Used this operand as a scalar operand\n");
+
+    return MachineOperand::CreateImm(1);
+  }
+
+  unsigned VLOpNum = RISCVII::getVLOpNum(Desc);
+  const MachineOperand &VLOp = UserMI.getOperand(VLOpNum);
+  // Looking for an immediate or a register VL that isn't X0.
+  assert((!VLOp.isReg() || VLOp.getReg() != RISCV::X0) &&
+         "Did not expect X0 VL");
+  return VLOp;
+}
+
+std::optional<MachineOperand> RISCVVLOptimizer::checkUsers(MachineInstr &MI) {
   // FIXME: Avoid visiting each user for each time we visit something on the
   // worklist, combined with an extra visit from the outer loop. Restructure
   // along lines of an instcombine style worklist which integrates the outer
   // pass.
-  bool CanReduceVL = true;
+  std::optional<MachineOperand> CommonVL;
   for (auto &UserOp : MRI->use_operands(MI.getOperand(0).getReg())) {
     const MachineInstr &UserMI = *UserOp.getParent();
     LLVM_DEBUG(dbgs() << "  Checking user: " << UserMI << "\n");
-
-    // Instructions like reductions may use a vector register as a scalar
-    // register. In this case, we should treat it like a scalar register which
-    // does not impact the decision on whether to optimize VL.
-    // TODO: Treat it like a scalar register instead of bailing out.
-    if (isVectorOpUsedAsScalarOp(UserOp)) {
-      CanReduceVL = false;
-      break;
-    }
-
     if (mayReadPastVL(UserMI)) {
       LLVM_DEBUG(dbgs() << "    Abort because used by unsafe instruction\n");
-      CanReduceVL = false;
-      break;
+      return std::nullopt;
     }
 
     // Tied operands might pass through.
     if (UserOp.isTied()) {
       LLVM_DEBUG(dbgs() << "    Abort because user used as tied operand\n");
-      CanReduceVL = false;
-      break;
+      return std::nullopt;
     }
 
-    const MCInstrDesc &Desc = UserMI.getDesc();
-    if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags)) {
-      LLVM_DEBUG(dbgs() << "    Abort due to lack of VL or SEW, assume that"
-                           " use VLMAX\n");
-      CanReduceVL = false;
-      break;
-    }
-
-    unsigned VLOpNum = RISCVII::getVLOpNum(Desc);
-    const MachineOperand &VLOp = UserMI.getOperand(VLOpNum);
-
-    // Looking for an immediate or a register VL that isn't X0.
-    assert((!VLOp.isReg() || VLOp.getReg() != RISCV::X0) &&
-           "Did not expect X0 VL");
+    auto VLOp = getMinimumVLForUser(UserOp);
+    if (!VLOp)
+      return std::nullopt;
 
     // Use the largest VL among all the users. If we cannot determine this
     // statically, then we cannot optimize the VL.
-    if (!CommonVL || RISCV::isVLKnownLE(*CommonVL, VLOp)) {
-      CommonVL = &VLOp;
+    if (!CommonVL || RISCV::isVLKnownLE(*CommonVL, *VLOp)) {
+      CommonVL = *VLOp;
       LLVM_DEBUG(dbgs() << "    User VL is: " << VLOp << "\n");
-    } else if (!RISCV::isVLKnownLE(VLOp, *CommonVL)) {
+    } else if (!RISCV::isVLKnownLE(*VLOp, *CommonVL)) {
       LLVM_DEBUG(dbgs() << "    Abort because cannot determine a common VL\n");
-      CanReduceVL = false;
-      break;
+      return std::nullopt;
+    }
+
+    if (!RISCVII::hasSEWOp(UserMI.getDesc().TSFlags)) {
+      LLVM_DEBUG(dbgs() << "    Abort due to lack of SEW operand\n");
+      return std::nullopt;
+    }
+
+    std::optional<OperandInfo> ConsumerInfo = getOperandInfo(UserOp, MRI);
+    std::optional<OperandInfo> ProducerInfo =
+        getOperandInfo(MI.getOperand(0), MRI);
+    if (!ConsumerInfo || !ProducerInfo) {
+      LLVM_DEBUG(dbgs() << "    Abort due to unknown operand information.\n");
+      LLVM_DEBUG(dbgs() << "      ConsumerInfo is: " << ConsumerInfo << "\n");
+      LLVM_DEBUG(dbgs() << "      ProducerInfo is: " << ProducerInfo << "\n");
+      return std::nullopt;
     }
 
-    // The SEW and LMUL of destination and source registers need to match.
-    OperandInfo ConsumerInfo = getOperandInfo(UserOp, MRI);
-    OperandInfo ProducerInfo = getOperandInfo(MI.getOperand(0), MRI);
-    if (ConsumerInfo.isUnknown() || ProducerInfo.isUnknown() ||
-        !OperandInfo::EMULAndEEWAreEqual(ConsumerInfo, ProducerInfo)) {
-      LLVM_DEBUG(dbgs() << "    Abort due to incompatible or unknown "
-                           "information for EMUL or EEW.\n");
+    // If the operand is used as a scalar operand, then the EEW must be
+    // compatible. Otherwise, the EMUL *and* EEW must be compatible.
+    bool IsVectorOpUsedAsScalarOp = isVectorOpUsedAsScalarOp(UserOp);
+    if ((IsVectorOpUsedAsScalarOp &&
+         !OperandInfo::EEWAreEqual(*ConsumerInfo, *ProducerInfo)) ||
+        (!IsVectorOpUsedAsScalarOp &&
+         !OperandInfo::EMULAndEEWAreEqual(*ConsumerInfo, *ProducerInfo))) {
+      LLVM_DEBUG(
+          dbgs()
+          << "    Abort due to incompatible information for EMUL or EEW.\n");
       LLVM_DEBUG(dbgs() << "      ConsumerInfo is: " << ConsumerInfo << "\n");
       LLVM_DEBUG(dbgs() << "      ProducerInfo is: " << ProducerInfo << "\n");
-      CanReduceVL = false;
-      break;
+      return std::nullopt;
     }
   }
-  return CanReduceVL;
+
+  return CommonVL;
 }
 
 bool RISCVVLOptimizer::tryReduceVL(MachineInstr &OrigMI) {
@@ -1112,12 +1219,11 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &OrigMI) {
     MachineInstr &MI = *Worklist.pop_back_val();
     LLVM_DEBUG(dbgs() << "Trying to reduce VL for " << MI << "\n");
 
-    const MachineOperand *CommonVL = nullptr;
-    bool CanReduceVL = true;
-    if (isVectorRegClass(MI.getOperand(0).getReg(), MRI))
-      CanReduceVL = checkUsers(CommonVL, MI);
+    if (!isVectorRegClass(MI.getOperand(0).getReg(), MRI))
+      continue;
 
-    if (!CanReduceVL || !CommonVL)
+    auto CommonVL = checkUsers(MI);
+    if (!CommonVL)
       continue;
 
     assert((CommonVL->isImm() || CommonVL->getReg().isVirtual()) &&
diff --git a/llvm/lib/Target/SPIRV/CMakeLists.txt b/llvm/lib/Target/SPIRV/CMakeLists.txt
index aa83d997578f..a79e19fcd753 100644
--- a/llvm/lib/Target/SPIRV/CMakeLists.txt
+++ b/llvm/lib/Target/SPIRV/CMakeLists.txt
@@ -20,7 +20,6 @@ add_llvm_target(SPIRVCodeGen
   SPIRVCallLowering.cpp
   SPIRVInlineAsmLowering.cpp
   SPIRVCommandLine.cpp
-  SPIRVDuplicatesTracker.cpp
   SPIRVEmitIntrinsics.cpp
   SPIRVGlobalRegistry.cpp
   SPIRVInstrInfo.cpp
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index 4012bd7696c4..78add9214682 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -274,7 +274,7 @@ void SPIRVAsmPrinter::emitInstruction(const MachineInstr *MI) {
 }
 
 void SPIRVAsmPrinter::outputModuleSection(SPIRV::ModuleSectionType MSType) {
-  for (MachineInstr *MI : MAI->getMSInstrs(MSType))
+  for (const MachineInstr *MI : MAI->getMSInstrs(MSType))
     outputInstruction(MI);
 }
 
@@ -326,7 +326,7 @@ void SPIRVAsmPrinter::outputOpMemoryModel() {
 void SPIRVAsmPrinter::outputEntryPoints() {
   // Find all OpVariable IDs with required StorageClass.
   DenseSet<Register> InterfaceIDs;
-  for (MachineInstr *MI : MAI->GlobalVarList) {
+  for (const MachineInstr *MI : MAI->GlobalVarList) {
     assert(MI->getOpcode() == SPIRV::OpVariable);
     auto SC = static_cast<SPIRV::StorageClass::StorageClass>(
         MI->getOperand(2).getImm());
@@ -336,14 +336,14 @@ void SPIRVAsmPrinter::outputEntryPoints() {
     // declaring all global variables referenced by the entry point call tree.
     if (ST->isAtLeastSPIRVVer(VersionTuple(1, 4)) ||
         SC == SPIRV::StorageClass::Input || SC == SPIRV::StorageClass::Output) {
-      MachineFunction *MF = MI->getMF();
+      const MachineFunction *MF = MI->getMF();
       Register Reg = MAI->getRegisterAlias(MF, MI->getOperand(0).getReg());
       InterfaceIDs.insert(Reg);
     }
   }
 
   // Output OpEntryPoints adding interface args to all of them.
-  for (MachineInstr *MI : MAI->getMSInstrs(SPIRV::MB_EntryPoints)) {
+  for (const MachineInstr *MI : MAI->getMSInstrs(SPIRV::MB_EntryPoints)) {
     SPIRVMCInstLower MCInstLowering;
     MCInst TmpInst;
     MCInstLowering.lower(MI, TmpInst, MAI);
@@ -381,9 +381,8 @@ void SPIRVAsmPrinter::outputGlobalRequirements() {
 
 void SPIRVAsmPrinter::outputExtFuncDecls() {
   // Insert OpFunctionEnd after each declaration.
-  SmallVectorImpl<MachineInstr *>::iterator
-      I = MAI->getMSInstrs(SPIRV::MB_ExtFuncDecls).begin(),
-      E = MAI->getMSInstrs(SPIRV::MB_ExtFuncDecls).end();
+  auto I = MAI->getMSInstrs(SPIRV::MB_ExtFuncDecls).begin(),
+       E = MAI->getMSInstrs(SPIRV::MB_ExtFuncDecls).end();
   for (; I != E; ++I) {
     outputInstruction(*I);
     if ((I + 1) == E || (*(I + 1))->getOpcode() == SPIRV::OpFunction)
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index fa37313f8247..44b6f5f8d507 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -418,6 +418,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
                                .addImm(FuncControl)
                                .addUse(GR->getSPIRVTypeID(FuncTy));
   GR->recordFunctionDefinition(&F, &MB.getInstr()->getOperand(0));
+  GR->addGlobalObject(&F, &MIRBuilder.getMF(), FuncVReg);
 
   // Add OpFunctionParameter instructions
   int i = 0;
@@ -431,6 +432,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
         .addUse(GR->getSPIRVTypeID(ArgTypeVRegs[i]));
     if (F.isDeclaration())
       GR->add(&Arg, &MIRBuilder.getMF(), ArgReg);
+    GR->addGlobalObject(&Arg, &MIRBuilder.getMF(), ArgReg);
     i++;
   }
   // Name the function.
diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp
deleted file mode 100644
index 48df845efd76..000000000000
--- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-//===-- SPIRVDuplicatesTracker.cpp - SPIR-V Duplicates Tracker --*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// General infrastructure for keeping track of the values that according to
-// the SPIR-V binary layout should be global to the whole module.
-//
-//===----------------------------------------------------------------------===//
-
-#include "SPIRVDuplicatesTracker.h"
-#include "SPIRVInstrInfo.h"
-
-#define DEBUG_TYPE "build-dep-graph"
-
-using namespace llvm;
-
-template <typename T>
-void SPIRVGeneralDuplicatesTracker::prebuildReg2Entry(
-    SPIRVDuplicatesTracker<T> &DT, SPIRVReg2EntryTy &Reg2Entry,
-    const SPIRVInstrInfo *TII) {
-  for (auto &TPair : DT.getAllUses()) {
-    for (auto &RegPair : TPair.second) {
-      const MachineFunction *MF = RegPair.first;
-      Register R = RegPair.second;
-      MachineInstr *MI = MF->getRegInfo().getUniqueVRegDef(R);
-      if (!MI || (TPair.second.getIsConst() && !TII->isConstantInstr(*MI)))
-        continue;
-      Reg2Entry[&MI->getOperand(0)] = &TPair.second;
-    }
-  }
-}
-
-void SPIRVGeneralDuplicatesTracker::buildDepsGraph(
-    std::vector<SPIRV::DTSortableEntry *> &Graph, const SPIRVInstrInfo *TII,
-    MachineModuleInfo *MMI = nullptr) {
-  SPIRVReg2EntryTy Reg2Entry;
-  prebuildReg2Entry(TT, Reg2Entry, TII);
-  prebuildReg2Entry(CT, Reg2Entry, TII);
-  prebuildReg2Entry(GT, Reg2Entry, TII);
-  prebuildReg2Entry(FT, Reg2Entry, TII);
-  prebuildReg2Entry(AT, Reg2Entry, TII);
-  prebuildReg2Entry(MT, Reg2Entry, TII);
-  prebuildReg2Entry(ST, Reg2Entry, TII);
-
-  for (auto &Op2E : Reg2Entry) {
-    SPIRV::DTSortableEntry *E = Op2E.second;
-    Graph.push_back(E);
-    for (auto &U : *E) {
-      const MachineRegisterInfo &MRI = U.first->getRegInfo();
-      MachineInstr *MI = MRI.getUniqueVRegDef(U.second);
-      if (!MI)
-        continue;
-      assert(MI && MI->getParent() && "No MachineInstr created yet");
-      for (auto i = MI->getNumDefs(); i < MI->getNumOperands(); i++) {
-        MachineOperand &Op = MI->getOperand(i);
-        if (!Op.isReg())
-          continue;
-        MachineInstr *VRegDef = MRI.getVRegDef(Op.getReg());
-        // References to a function via function pointers generate virtual
-        // registers without a definition. We are able to resolve this
-        // reference using Globar Register info into an OpFunction instruction
-        // but do not expect to find it in Reg2Entry.
-        if (MI->getOpcode() == SPIRV::OpConstantFunctionPointerINTEL && i == 2)
-          continue;
-        MachineOperand *RegOp = &VRegDef->getOperand(0);
-        if (Reg2Entry.count(RegOp) == 0 &&
-            (MI->getOpcode() != SPIRV::OpVariable || i != 3)) {
-          // try to repair the unexpected code pattern
-          bool IsFixed = false;
-          if (VRegDef->getOpcode() == TargetOpcode::G_CONSTANT &&
-              RegOp->isReg() && MRI.getType(RegOp->getReg()).isScalar()) {
-            const Constant *C = VRegDef->getOperand(1).getCImm();
-            add(C, MI->getParent()->getParent(), RegOp->getReg());
-            auto Iter = CT.Storage.find(C);
-            if (Iter != CT.Storage.end()) {
-              SPIRV::DTSortableEntry &MissedEntry = Iter->second;
-              Reg2Entry[RegOp] = &MissedEntry;
-              IsFixed = true;
-            }
-          }
-          if (!IsFixed) {
-            std::string DiagMsg;
-            raw_string_ostream OS(DiagMsg);
-            OS << "Unexpected pattern while building a dependency "
-                  "graph.\nInstruction: ";
-            MI->print(OS);
-            OS << "Operand: ";
-            Op.print(OS);
-            OS << "\nOperand definition: ";
-            VRegDef->print(OS);
-            report_fatal_error(DiagMsg.c_str());
-          }
-        }
-        if (Reg2Entry.count(RegOp))
-          E->addDep(Reg2Entry[RegOp]);
-      }
-
-      if (E->getIsFunc()) {
-        MachineInstr *Next = MI->getNextNode();
-        if (Next && (Next->getOpcode() == SPIRV::OpFunction ||
-                     Next->getOpcode() == SPIRV::OpFunctionParameter)) {
-          E->addDep(Reg2Entry[&Next->getOperand(0)]);
-        }
-      }
-    }
-  }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  if (MMI) {
-    const Module *M = MMI->getModule();
-    for (auto F = M->begin(), E = M->end(); F != E; ++F) {
-      const MachineFunction *MF = MMI->getMachineFunction(*F);
-      if (!MF)
-        continue;
-      for (const MachineBasicBlock &MBB : *MF) {
-        for (const MachineInstr &CMI : MBB) {
-          MachineInstr &MI = const_cast<MachineInstr &>(CMI);
-          MI.dump();
-          if (MI.getNumExplicitDefs() > 0 &&
-              Reg2Entry.count(&MI.getOperand(0))) {
-            dbgs() << "\t[";
-            for (SPIRV::DTSortableEntry *D :
-                 Reg2Entry.lookup(&MI.getOperand(0))->getDeps())
-              dbgs() << Register::virtReg2Index(D->lookup(MF)) << ", ";
-            dbgs() << "]\n";
-          }
-        }
-      }
-    }
-  }
-#endif
-}
diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
index 6847da050979..e57489271229 100644
--- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
+++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
@@ -211,23 +211,7 @@ class SPIRVGeneralDuplicatesTracker {
   SPIRVDuplicatesTracker<MachineInstr> MT;
   SPIRVDuplicatesTracker<SPIRV::SpecialTypeDescriptor> ST;
 
-  // NOTE: using MOs instead of regs to get rid of MF dependency to be able
-  // to use flat data structure.
-  // NOTE: replacing DenseMap with MapVector doesn't affect overall correctness
-  // but makes LITs more stable, should prefer DenseMap still due to
-  // significant perf difference.
-  using SPIRVReg2EntryTy =
-      MapVector<MachineOperand *, SPIRV::DTSortableEntry *>;
-
-  template <typename T>
-  void prebuildReg2Entry(SPIRVDuplicatesTracker<T> &DT,
-                         SPIRVReg2EntryTy &Reg2Entry,
-                         const SPIRVInstrInfo *TII);
-
 public:
-  void buildDepsGraph(std::vector<SPIRV::DTSortableEntry *> &Graph,
-                      const SPIRVInstrInfo *TII, MachineModuleInfo *MMI);
-
   void add(const Type *Ty, const MachineFunction *MF, Register R) {
     TT.add(unifyPtrType(Ty), MF, R);
   }
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 77b54219a9ac..d2b14d6d058c 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -1841,20 +1841,20 @@ void SPIRVEmitIntrinsics::processGlobalValue(GlobalVariable &GV,
   // Skip special artifical variable llvm.global.annotations.
   if (GV.getName() == "llvm.global.annotations")
     return;
-  if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) {
+  Constant *Init = nullptr;
+  if (hasInitializer(&GV)) {
     // Deduce element type and store results in Global Registry.
     // Result is ignored, because TypedPointerType is not supported
     // by llvm IR general logic.
     deduceElementTypeHelper(&GV, false);
-    Constant *Init = GV.getInitializer();
+    Init = GV.getInitializer();
     Type *Ty = isAggrConstForceInt32(Init) ? B.getInt32Ty() : Init->getType();
     Constant *Const = isAggrConstForceInt32(Init) ? B.getInt32(1) : Init;
     auto *InitInst = B.CreateIntrinsic(Intrinsic::spv_init_global,
                                        {GV.getType(), Ty}, {&GV, Const});
     InitInst->setArgOperand(1, Init);
   }
-  if ((!GV.hasInitializer() || isa<UndefValue>(GV.getInitializer())) &&
-      GV.getNumUses() == 0)
+  if (!Init && GV.getNumUses() == 0)
     B.CreateIntrinsic(Intrinsic::spv_unref_global, GV.getType(), &GV);
 }
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 0c4244770010..a06c62e68d10 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -721,6 +721,7 @@ Register SPIRVGlobalRegistry::buildGlobalVariable(
   }
   Reg = MIB->getOperand(0).getReg();
   DT.add(GVar, &MIRBuilder.getMF(), Reg);
+  addGlobalObject(GVar, &MIRBuilder.getMF(), Reg);
 
   // Set to Reg the same type as ResVReg has.
   auto MRI = MIRBuilder.getMRI();
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index ec2386fa1e56..528baf5f8d9e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -89,6 +89,9 @@ class SPIRVGlobalRegistry {
   // Intrinsic::spv_assign_ptr_type instructions.
   DenseMap<Value *, CallInst *> AssignPtrTypeInstr;
 
+  // Maps OpVariable and OpFunction-related v-regs to its LLVM IR definition.
+  DenseMap<std::pair<const MachineFunction *, Register>, const Value *> Reg2GO;
+
   // Add a new OpTypeXXX instruction without checking for duplicates.
   SPIRVType *createSPIRVType(const Type *Type, MachineIRBuilder &MIRBuilder,
                              SPIRV::AccessQualifier::AccessQualifier AQ =
@@ -151,15 +154,17 @@ public:
     return DT.find(F, MF);
   }
 
-  void buildDepsGraph(std::vector<SPIRV::DTSortableEntry *> &Graph,
-                      const SPIRVInstrInfo *TII,
-                      MachineModuleInfo *MMI = nullptr) {
-    DT.buildDepsGraph(Graph, TII, MMI);
-  }
-
   void setBound(unsigned V) { Bound = V; }
   unsigned getBound() { return Bound; }
 
+  void addGlobalObject(const Value *V, const MachineFunction *MF, Register R) {
+    Reg2GO[std::make_pair(MF, R)] = V;
+  }
+  const Value *getGlobalObject(const MachineFunction *MF, Register R) {
+    auto It = Reg2GO.find(std::make_pair(MF, R));
+    return It == Reg2GO.end() ? nullptr : It->second;
+  }
+
   // Add a record to the map of function return pointer types.
   void addReturnType(const Function *ArgF, TypedPointerType *DerivedTy) {
     FunResPointerTypes[ArgF] = DerivedTy;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
index bd9e77e9427c..9a140e75f8ea 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
@@ -47,6 +47,19 @@ bool SPIRVInstrInfo::isConstantInstr(const MachineInstr &MI) const {
   }
 }
 
+bool SPIRVInstrInfo::isSpecConstantInstr(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case SPIRV::OpSpecConstantTrue:
+  case SPIRV::OpSpecConstantFalse:
+  case SPIRV::OpSpecConstant:
+  case SPIRV::OpSpecConstantComposite:
+  case SPIRV::OpSpecConstantOp:
+    return true;
+  default:
+    return false;
+  }
+}
+
 bool SPIRVInstrInfo::isInlineAsmDefInstr(const MachineInstr &MI) const {
   switch (MI.getOpcode()) {
   case SPIRV::OpAsmTargetINTEL:
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
index 67d2d979cb5a..4e5059b4b889 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
@@ -30,6 +30,7 @@ public:
   const SPIRVRegisterInfo &getRegisterInfo() const { return RI; }
   bool isHeaderInstr(const MachineInstr &MI) const;
   bool isConstantInstr(const MachineInstr &MI) const;
+  bool isSpecConstantInstr(const MachineInstr &MI) const;
   bool isInlineAsmDefInstr(const MachineInstr &MI) const;
   bool isTypeDeclInstr(const MachineInstr &MI) const;
   bool isDecorationInstr(const MachineInstr &MI) const;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 48156856f62c..28c9b81db51f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -33,7 +33,6 @@
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/IR/IntrinsicsSPIRV.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
 
 #define DEBUG_TYPE "spirv-isel"
 
@@ -46,17 +45,6 @@ using ExtInstList =
 
 namespace {
 
-llvm::SPIRV::SelectionControl::SelectionControl
-getSelectionOperandForImm(int Imm) {
-  if (Imm == 2)
-    return SPIRV::SelectionControl::Flatten;
-  if (Imm == 1)
-    return SPIRV::SelectionControl::DontFlatten;
-  if (Imm == 0)
-    return SPIRV::SelectionControl::None;
-  llvm_unreachable("Invalid immediate");
-}
-
 #define GET_GLOBALISEL_PREDICATE_BITSET
 #include "SPIRVGenGlobalISel.inc"
 #undef GET_GLOBALISEL_PREDICATE_BITSET
@@ -1117,6 +1105,7 @@ bool SPIRVInstructionSelector::selectMemOperation(Register ResVReg,
                                             Constant::getNullValue(LLVMArrTy));
     Register VarReg = MRI->createGenericVirtualRegister(LLT::scalar(64));
     GR.add(GV, GR.CurMF, VarReg);
+    GR.addGlobalObject(GV, GR.CurMF, VarReg);
 
     Result &=
         BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpVariable))
@@ -2829,8 +2818,12 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
     }
     return MIB.constrainAllUses(TII, TRI, RBI);
   }
-  case Intrinsic::spv_loop_merge: {
-    auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpLoopMerge));
+  case Intrinsic::spv_loop_merge:
+  case Intrinsic::spv_selection_merge: {
+    const auto Opcode = IID == Intrinsic::spv_selection_merge
+                            ? SPIRV::OpSelectionMerge
+                            : SPIRV::OpLoopMerge;
+    auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(Opcode));
     for (unsigned i = 1; i < I.getNumExplicitOperands(); ++i) {
       assert(I.getOperand(i).isMBB());
       MIB.addMBB(I.getOperand(i).getMBB());
@@ -2838,15 +2831,6 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
     MIB.addImm(SPIRV::SelectionControl::None);
     return MIB.constrainAllUses(TII, TRI, RBI);
   }
-  case Intrinsic::spv_selection_merge: {
-    auto MIB =
-        BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSelectionMerge));
-    assert(I.getOperand(1).isMBB() &&
-           "operand 1 to spv_selection_merge must be a basic block");
-    MIB.addMBB(I.getOperand(1).getMBB());
-    MIB.addImm(getSelectionOperandForImm(I.getOperand(2).getImm()));
-    return MIB.constrainAllUses(TII, TRI, RBI);
-  }
   case Intrinsic::spv_cmpxchg:
     return selectAtomicCmpXchg(ResVReg, ResType, I);
   case Intrinsic::spv_unreachable:
@@ -3477,7 +3461,7 @@ bool SPIRVInstructionSelector::selectGlobalValue(
       ID = UnnamedGlobalIDs.size();
     GlobalIdent = "__unnamed_" + Twine(ID).str();
   } else {
-    GlobalIdent = GV->getGlobalIdentifier();
+    GlobalIdent = GV->getName();
   }
 
   // Behaviour of functions as operands depends on availability of the
@@ -3509,18 +3493,25 @@ bool SPIRVInstructionSelector::selectGlobalValue(
         // References to a function via function pointers generate virtual
         // registers without a definition. We will resolve it later, during
         // module analysis stage.
+        Register ResTypeReg = GR.getSPIRVTypeID(ResType);
         MachineRegisterInfo *MRI = MIRBuilder.getMRI();
-        Register FuncVReg = MRI->createGenericVirtualRegister(LLT::scalar(64));
-        MRI->setRegClass(FuncVReg, &SPIRV::iIDRegClass);
-        MachineInstrBuilder MB =
+        Register FuncVReg =
+            MRI->createGenericVirtualRegister(GR.getRegType(ResType));
+        MRI->setRegClass(FuncVReg, &SPIRV::pIDRegClass);
+        MachineInstrBuilder MIB1 =
+            BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpUndef))
+                .addDef(FuncVReg)
+                .addUse(ResTypeReg);
+        MachineInstrBuilder MIB2 =
             BuildMI(BB, I, I.getDebugLoc(),
                     TII.get(SPIRV::OpConstantFunctionPointerINTEL))
                 .addDef(NewReg)
-                .addUse(GR.getSPIRVTypeID(ResType))
+                .addUse(ResTypeReg)
                 .addUse(FuncVReg);
         // mapping the function pointer to the used Function
-        GR.recordFunctionPointer(&MB.getInstr()->getOperand(2), GVFun);
-        return MB.constrainAllUses(TII, TRI, RBI);
+        GR.recordFunctionPointer(&MIB2.getInstr()->getOperand(2), GVFun);
+        return MIB1.constrainAllUses(TII, TRI, RBI) &&
+               MIB2.constrainAllUses(TII, TRI, RBI);
       }
       return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull))
           .addDef(NewReg)
@@ -3533,18 +3524,16 @@ bool SPIRVInstructionSelector::selectGlobalValue(
   auto GlobalVar = cast<GlobalVariable>(GV);
   assert(GlobalVar->getName() != "llvm.global.annotations");
 
-  bool HasInit = GlobalVar->hasInitializer() &&
-                 !isa<UndefValue>(GlobalVar->getInitializer());
-  // Skip empty declaration for GVs with initilaizers till we get the decl with
+  // Skip empty declaration for GVs with initializers till we get the decl with
   // passed initializer.
-  if (HasInit && !Init)
+  if (hasInitializer(GlobalVar) && !Init)
     return true;
 
-  bool HasLnkTy = GV->getLinkage() != GlobalValue::InternalLinkage;
+  bool HasLnkTy = !GV->hasInternalLinkage() && !GV->hasPrivateLinkage();
   SPIRV::LinkageType::LinkageType LnkType =
-      (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())
+      GV->isDeclarationForLinker()
           ? SPIRV::LinkageType::Import
-          : (GV->getLinkage() == GlobalValue::LinkOnceODRLinkage &&
+          : (GV->hasLinkOnceODRLinkage() &&
                      STI.canUseExtension(SPIRV::Extension::SPV_KHR_linkonce_odr)
                  ? SPIRV::LinkageType::LinkOnceODR
                  : SPIRV::LinkageType::Export);
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 6371c67d9245..63adf545775c 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -216,102 +216,262 @@ void SPIRVModuleAnalysis::setBaseInfo(const Module &M) {
   }
 }
 
-// Collect MI which defines the register in the given machine function.
-static void collectDefInstr(Register Reg, const MachineFunction *MF,
-                            SPIRV::ModuleAnalysisInfo *MAI,
-                            SPIRV::ModuleSectionType MSType,
-                            bool DoInsert = true) {
-  assert(MAI->hasRegisterAlias(MF, Reg) && "Cannot find register alias");
-  MachineInstr *MI = MF->getRegInfo().getUniqueVRegDef(Reg);
-  assert(MI && "There should be an instruction that defines the register");
-  MAI->setSkipEmission(MI);
-  if (DoInsert)
-    MAI->MS[MSType].push_back(MI);
+// Returns a representation of an instruction as a vector of MachineOperand
+// hash values, see llvm::hash_value(const MachineOperand &MO) for details.
+// This creates a signature of the instruction with the same content
+// that MachineOperand::isIdenticalTo uses for comparison.
+static InstrSignature instrToSignature(const MachineInstr &MI,
+                                       SPIRV::ModuleAnalysisInfo &MAI,
+                                       bool UseDefReg) {
+  InstrSignature Signature{MI.getOpcode()};
+  for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    size_t h;
+    if (MO.isReg()) {
+      if (!UseDefReg && MO.isDef())
+        continue;
+      Register RegAlias = MAI.getRegisterAlias(MI.getMF(), MO.getReg());
+      if (!RegAlias.isValid()) {
+        LLVM_DEBUG({
+          dbgs() << "Unexpectedly, no global id found for the operand ";
+          MO.print(dbgs());
+          dbgs() << "\nInstruction: ";
+          MI.print(dbgs());
+          dbgs() << "\n";
+        });
+        report_fatal_error("All v-regs must have been mapped to global id's");
+      }
+      // mimic llvm::hash_value(const MachineOperand &MO)
+      h = hash_combine(MO.getType(), (unsigned)RegAlias, MO.getSubReg(),
+                       MO.isDef());
+    } else {
+      h = hash_value(MO);
+    }
+    Signature.push_back(h);
+  }
+  return Signature;
 }
 
-void SPIRVModuleAnalysis::collectGlobalEntities(
-    const std::vector<SPIRV::DTSortableEntry *> &DepsGraph,
-    SPIRV::ModuleSectionType MSType,
-    std::function<bool(const SPIRV::DTSortableEntry *)> Pred,
-    bool UsePreOrder = false) {
-  DenseSet<const SPIRV::DTSortableEntry *> Visited;
-  for (const auto *E : DepsGraph) {
-    std::function<void(const SPIRV::DTSortableEntry *)> RecHoistUtil;
-    // NOTE: here we prefer recursive approach over iterative because
-    // we don't expect depchains long enough to cause SO.
-    RecHoistUtil = [MSType, UsePreOrder, &Visited, &Pred,
-                    &RecHoistUtil](const SPIRV::DTSortableEntry *E) {
-      if (Visited.count(E) || !Pred(E))
-        return;
-      Visited.insert(E);
-
-      // Traversing deps graph in post-order allows us to get rid of
-      // register aliases preprocessing.
-      // But pre-order is required for correct processing of function
-      // declaration and arguments processing.
-      if (!UsePreOrder)
-        for (auto *S : E->getDeps())
-          RecHoistUtil(S);
-
-      Register GlobalReg = Register::index2VirtReg(MAI.getNextID());
-      bool IsFirst = true;
-      for (auto &U : *E) {
-        const MachineFunction *MF = U.first;
-        Register Reg = U.second;
-        MAI.setRegisterAlias(MF, Reg, GlobalReg);
-        if (!MF->getRegInfo().getUniqueVRegDef(Reg))
-          continue;
-        collectDefInstr(Reg, MF, &MAI, MSType, IsFirst);
-        IsFirst = false;
-        if (E->getIsGV())
-          MAI.GlobalVarList.push_back(MF->getRegInfo().getUniqueVRegDef(Reg));
-      }
+bool SPIRVModuleAnalysis::isDeclSection(const MachineRegisterInfo &MRI,
+                                        const MachineInstr &MI) {
+  unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+  case SPIRV::OpTypeForwardPointer:
+    // omit now, collect later
+    return false;
+  case SPIRV::OpVariable:
+    return static_cast<SPIRV::StorageClass::StorageClass>(
+               MI.getOperand(2).getImm()) != SPIRV::StorageClass::Function;
+  case SPIRV::OpFunction:
+  case SPIRV::OpFunctionParameter:
+    return true;
+  }
+  if (GR->hasConstFunPtr() && Opcode == SPIRV::OpUndef) {
+    Register DefReg = MI.getOperand(0).getReg();
+    for (MachineInstr &UseMI : MRI.use_instructions(DefReg)) {
+      if (UseMI.getOpcode() != SPIRV::OpConstantFunctionPointerINTEL)
+        continue;
+      // it's a dummy definition, FP constant refers to a function,
+      // and this is resolved in another way; let's skip this definition
+      assert(UseMI.getOperand(2).isReg() &&
+             UseMI.getOperand(2).getReg() == DefReg);
+      MAI.setSkipEmission(&MI);
+      return false;
+    }
+  }
+  return TII->isTypeDeclInstr(MI) || TII->isConstantInstr(MI) ||
+         TII->isInlineAsmDefInstr(MI);
+}
 
-      if (UsePreOrder)
-        for (auto *S : E->getDeps())
-          RecHoistUtil(S);
-    };
-    RecHoistUtil(E);
+// This is a special case of a function pointer refering to a possibly
+// forward function declaration. The operand is a dummy OpUndef that
+// requires a special treatment.
+void SPIRVModuleAnalysis::visitFunPtrUse(
+    Register OpReg, InstrGRegsMap &SignatureToGReg,
+    std::map<const Value *, unsigned> &GlobalToGReg, const MachineFunction *MF,
+    const MachineInstr &MI) {
+  const MachineOperand *OpFunDef =
+      GR->getFunctionDefinitionByUse(&MI.getOperand(2));
+  assert(OpFunDef && OpFunDef->isReg());
+  // find the actual function definition and number it globally in advance
+  const MachineInstr *OpDefMI = OpFunDef->getParent();
+  assert(OpDefMI && OpDefMI->getOpcode() == SPIRV::OpFunction);
+  const MachineFunction *FunDefMF = OpDefMI->getParent()->getParent();
+  const MachineRegisterInfo &FunDefMRI = FunDefMF->getRegInfo();
+  do {
+    visitDecl(FunDefMRI, SignatureToGReg, GlobalToGReg, FunDefMF, *OpDefMI);
+    OpDefMI = OpDefMI->getNextNode();
+  } while (OpDefMI && (OpDefMI->getOpcode() == SPIRV::OpFunction ||
+                       OpDefMI->getOpcode() == SPIRV::OpFunctionParameter));
+  // associate the function pointer with the newly assigned global number
+  Register GlobalFunDefReg = MAI.getRegisterAlias(FunDefMF, OpFunDef->getReg());
+  assert(GlobalFunDefReg.isValid() &&
+         "Function definition must refer to a global register");
+  MAI.setRegisterAlias(MF, OpReg, GlobalFunDefReg);
+}
+
+// Depth first recursive traversal of dependencies. Repeated visits are guarded
+// by MAI.hasRegisterAlias().
+void SPIRVModuleAnalysis::visitDecl(
+    const MachineRegisterInfo &MRI, InstrGRegsMap &SignatureToGReg,
+    std::map<const Value *, unsigned> &GlobalToGReg, const MachineFunction *MF,
+    const MachineInstr &MI) {
+  unsigned Opcode = MI.getOpcode();
+  DenseSet<Register> Deps;
+
+  // Process each operand of the instruction to resolve dependencies
+  for (const MachineOperand &MO : MI.operands()) {
+    if (!MO.isReg() || MO.isDef())
+      continue;
+    Register OpReg = MO.getReg();
+    // Handle function pointers special case
+    if (Opcode == SPIRV::OpConstantFunctionPointerINTEL &&
+        MRI.getRegClass(OpReg) == &SPIRV::pIDRegClass) {
+      visitFunPtrUse(OpReg, SignatureToGReg, GlobalToGReg, MF, MI);
+      continue;
+    }
+    // Skip already processed instructions
+    if (MAI.hasRegisterAlias(MF, MO.getReg()))
+      continue;
+    // Recursively visit dependencies
+    if (const MachineInstr *OpDefMI = MRI.getUniqueVRegDef(OpReg)) {
+      if (isDeclSection(MRI, *OpDefMI))
+        visitDecl(MRI, SignatureToGReg, GlobalToGReg, MF, *OpDefMI);
+      continue;
+    }
+    // Handle the unexpected case of no unique definition for the SPIR-V
+    // instruction
+    LLVM_DEBUG({
+      dbgs() << "Unexpectedly, no unique definition for the operand ";
+      MO.print(dbgs());
+      dbgs() << "\nInstruction: ";
+      MI.print(dbgs());
+      dbgs() << "\n";
+    });
+    report_fatal_error(
+        "No unique definition is found for the virtual register");
   }
+
+  Register GReg;
+  bool IsFunDef = false;
+  if (TII->isSpecConstantInstr(MI)) {
+    GReg = Register::index2VirtReg(MAI.getNextID());
+    MAI.MS[SPIRV::MB_TypeConstVars].push_back(&MI);
+  } else if (Opcode == SPIRV::OpFunction ||
+             Opcode == SPIRV::OpFunctionParameter) {
+    GReg = handleFunctionOrParameter(MF, MI, GlobalToGReg, IsFunDef);
+  } else if (TII->isTypeDeclInstr(MI) || TII->isConstantInstr(MI) ||
+             TII->isInlineAsmDefInstr(MI)) {
+    GReg = handleTypeDeclOrConstant(MI, SignatureToGReg);
+  } else if (Opcode == SPIRV::OpVariable) {
+    GReg = handleVariable(MF, MI, GlobalToGReg);
+  } else {
+    LLVM_DEBUG({
+      dbgs() << "\nInstruction: ";
+      MI.print(dbgs());
+      dbgs() << "\n";
+    });
+    llvm_unreachable("Unexpected instruction is visited");
+  }
+  MAI.setRegisterAlias(MF, MI.getOperand(0).getReg(), GReg);
+  if (!IsFunDef)
+    MAI.setSkipEmission(&MI);
 }
 
-// The function initializes global register alias table for types, consts,
-// global vars and func decls and collects these instruction for output
-// at module level. Also it collects explicit OpExtension/OpCapability
-// instructions.
-void SPIRVModuleAnalysis::processDefInstrs(const Module &M) {
-  std::vector<SPIRV::DTSortableEntry *> DepsGraph;
+Register SPIRVModuleAnalysis::handleFunctionOrParameter(
+    const MachineFunction *MF, const MachineInstr &MI,
+    std::map<const Value *, unsigned> &GlobalToGReg, bool &IsFunDef) {
+  const Value *GObj = GR->getGlobalObject(MF, MI.getOperand(0).getReg());
+  assert(GObj && "Unregistered global definition");
+  const Function *F = dyn_cast<Function>(GObj);
+  if (!F)
+    F = dyn_cast<Argument>(GObj)->getParent();
+  assert(F && "Expected a reference to a function or an argument");
+  IsFunDef = !F->isDeclaration();
+  auto It = GlobalToGReg.find(GObj);
+  if (It != GlobalToGReg.end())
+    return It->second;
+  Register GReg = Register::index2VirtReg(MAI.getNextID());
+  GlobalToGReg[GObj] = GReg;
+  if (!IsFunDef)
+    MAI.MS[SPIRV::MB_ExtFuncDecls].push_back(&MI);
+  return GReg;
+}
 
-  GR->buildDepsGraph(DepsGraph, TII, SPVDumpDeps ? MMI : nullptr);
+Register
+SPIRVModuleAnalysis::handleTypeDeclOrConstant(const MachineInstr &MI,
+                                              InstrGRegsMap &SignatureToGReg) {
+  InstrSignature MISign = instrToSignature(MI, MAI, false);
+  auto It = SignatureToGReg.find(MISign);
+  if (It != SignatureToGReg.end())
+    return It->second;
+  Register GReg = Register::index2VirtReg(MAI.getNextID());
+  SignatureToGReg[MISign] = GReg;
+  MAI.MS[SPIRV::MB_TypeConstVars].push_back(&MI);
+  return GReg;
+}
 
-  collectGlobalEntities(
-      DepsGraph, SPIRV::MB_TypeConstVars,
-      [](const SPIRV::DTSortableEntry *E) { return !E->getIsFunc(); });
+Register SPIRVModuleAnalysis::handleVariable(
+    const MachineFunction *MF, const MachineInstr &MI,
+    std::map<const Value *, unsigned> &GlobalToGReg) {
+  MAI.GlobalVarList.push_back(&MI);
+  const Value *GObj = GR->getGlobalObject(MF, MI.getOperand(0).getReg());
+  assert(GObj && "Unregistered global definition");
+  auto It = GlobalToGReg.find(GObj);
+  if (It != GlobalToGReg.end())
+    return It->second;
+  Register GReg = Register::index2VirtReg(MAI.getNextID());
+  GlobalToGReg[GObj] = GReg;
+  MAI.MS[SPIRV::MB_TypeConstVars].push_back(&MI);
+  return GReg;
+}
 
+void SPIRVModuleAnalysis::collectDeclarations(const Module &M) {
+  InstrGRegsMap SignatureToGReg;
+  std::map<const Value *, unsigned> GlobalToGReg;
   for (auto F = M.begin(), E = M.end(); F != E; ++F) {
     MachineFunction *MF = MMI->getMachineFunction(*F);
     if (!MF)
       continue;
-    // Iterate through and collect OpExtension/OpCapability instructions.
+    const MachineRegisterInfo &MRI = MF->getRegInfo();
+    unsigned PastHeader = 0;
     for (MachineBasicBlock &MBB : *MF) {
       for (MachineInstr &MI : MBB) {
-        if (MI.getOpcode() == SPIRV::OpExtension) {
-          // Here, OpExtension just has a single enum operand, not a string.
-          auto Ext = SPIRV::Extension::Extension(MI.getOperand(0).getImm());
-          MAI.Reqs.addExtension(Ext);
+        if (MI.getNumOperands() == 0)
+          continue;
+        unsigned Opcode = MI.getOpcode();
+        if (Opcode == SPIRV::OpFunction) {
+          if (PastHeader == 0) {
+            PastHeader = 1;
+            continue;
+          }
+        } else if (Opcode == SPIRV::OpFunctionParameter) {
+          if (PastHeader < 2)
+            continue;
+        } else if (PastHeader > 0) {
+          PastHeader = 2;
+        }
+
+        const MachineOperand &DefMO = MI.getOperand(0);
+        switch (Opcode) {
+        case SPIRV::OpExtension:
+          MAI.Reqs.addExtension(SPIRV::Extension::Extension(DefMO.getImm()));
           MAI.setSkipEmission(&MI);
-        } else if (MI.getOpcode() == SPIRV::OpCapability) {
-          auto Cap = SPIRV::Capability::Capability(MI.getOperand(0).getImm());
-          MAI.Reqs.addCapability(Cap);
+          break;
+        case SPIRV::OpCapability:
+          MAI.Reqs.addCapability(SPIRV::Capability::Capability(DefMO.getImm()));
           MAI.setSkipEmission(&MI);
+          if (PastHeader > 0)
+            PastHeader = 2;
+          break;
+        default:
+          if (DefMO.isReg() && isDeclSection(MRI, MI) &&
+              !MAI.hasRegisterAlias(MF, DefMO.getReg()))
+            visitDecl(MRI, SignatureToGReg, GlobalToGReg, MF, MI);
         }
       }
     }
   }
-
-  collectGlobalEntities(
-      DepsGraph, SPIRV::MB_ExtFuncDecls,
-      [](const SPIRV::DTSortableEntry *E) { return E->getIsFunc(); }, true);
 }
 
 // Look for IDs declared with Import linkage, and map the corresponding function
@@ -342,58 +502,6 @@ void SPIRVModuleAnalysis::collectFuncNames(MachineInstr &MI,
   }
 }
 
-// References to a function via function pointers generate virtual
-// registers without a definition. We are able to resolve this
-// reference using Globar Register info into an OpFunction instruction
-// and replace dummy operands by the corresponding global register references.
-void SPIRVModuleAnalysis::collectFuncPtrs() {
-  for (auto &MI : MAI.MS[SPIRV::MB_TypeConstVars])
-    if (MI->getOpcode() == SPIRV::OpConstantFunctionPointerINTEL)
-      collectFuncPtrs(MI);
-}
-
-void SPIRVModuleAnalysis::collectFuncPtrs(MachineInstr *MI) {
-  const MachineOperand *FunUse = &MI->getOperand(2);
-  if (const MachineOperand *FunDef = GR->getFunctionDefinitionByUse(FunUse)) {
-    const MachineInstr *FunDefMI = FunDef->getParent();
-    assert(FunDefMI->getOpcode() == SPIRV::OpFunction &&
-           "Constant function pointer must refer to function definition");
-    Register FunDefReg = FunDef->getReg();
-    Register GlobalFunDefReg =
-        MAI.getRegisterAlias(FunDefMI->getMF(), FunDefReg);
-    assert(GlobalFunDefReg.isValid() &&
-           "Function definition must refer to a global register");
-    Register FunPtrReg = FunUse->getReg();
-    MAI.setRegisterAlias(MI->getMF(), FunPtrReg, GlobalFunDefReg);
-  }
-}
-
-using InstrSignature = SmallVector<size_t>;
-using InstrTraces = std::set<InstrSignature>;
-
-// Returns a representation of an instruction as a vector of MachineOperand
-// hash values, see llvm::hash_value(const MachineOperand &MO) for details.
-// This creates a signature of the instruction with the same content
-// that MachineOperand::isIdenticalTo uses for comparison.
-static InstrSignature instrToSignature(MachineInstr &MI,
-                                       SPIRV::ModuleAnalysisInfo &MAI) {
-  InstrSignature Signature;
-  for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
-    const MachineOperand &MO = MI.getOperand(i);
-    size_t h;
-    if (MO.isReg()) {
-      Register RegAlias = MAI.getRegisterAlias(MI.getMF(), MO.getReg());
-      // mimic llvm::hash_value(const MachineOperand &MO)
-      h = hash_combine(MO.getType(), (unsigned)RegAlias, MO.getSubReg(),
-                       MO.isDef());
-    } else {
-      h = hash_value(MO);
-    }
-    Signature.push_back(h);
-  }
-  return Signature;
-}
-
 // Collect the given instruction in the specified MS. We assume global register
 // numbering has already occurred by this point. We can directly compare reg
 // arguments when detecting duplicates.
@@ -401,7 +509,7 @@ static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI,
                               SPIRV::ModuleSectionType MSType, InstrTraces &IS,
                               bool Append = true) {
   MAI.setSkipEmission(&MI);
-  InstrSignature MISign = instrToSignature(MI, MAI);
+  InstrSignature MISign = instrToSignature(MI, MAI, true);
   auto FoundMI = IS.insert(MISign);
   if (!FoundMI.second)
     return; // insert failed, so we found a duplicate; don't add it to MAI.MS
@@ -465,7 +573,7 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) {
 
 // Number registers in all functions globally from 0 onwards and store
 // the result in global register alias table. Some registers are already
-// numbered in collectGlobalEntities.
+// numbered.
 void SPIRVModuleAnalysis::numberRegistersGlobally(const Module &M) {
   for (auto F = M.begin(), E = M.end(); F != E; ++F) {
     if ((*F).isDeclaration())
@@ -1835,15 +1943,11 @@ bool SPIRVModuleAnalysis::runOnModule(Module &M) {
 
   // Process type/const/global var/func decl instructions, number their
   // destination registers from 0 to N, collect Extensions and Capabilities.
-  processDefInstrs(M);
+  collectDeclarations(M);
 
   // Number rest of registers from N+1 onwards.
   numberRegistersGlobally(M);
 
-  // Update references to OpFunction instructions to use Global Registers
-  if (GR->hasConstFunPtr())
-    collectFuncPtrs();
-
   // Collect OpName, OpEntryPoint, OpDecorate etc, process other instructions.
   processOtherInstrs(M);
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
index ee2aaf156aa8..79b5444cca20 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
@@ -124,7 +124,7 @@ public:
                           const Capability::Capability IfPresent);
 };
 
-using InstrList = SmallVector<MachineInstr *>;
+using InstrList = SmallVector<const MachineInstr *>;
 // Maps a local register to the corresponding global alias.
 using LocalToGlobalRegTable = std::map<Register, Register>;
 using RegisterAliasMapTy =
@@ -142,12 +142,12 @@ struct ModuleAnalysisInfo {
   // Maps ExtInstSet to corresponding ID register.
   DenseMap<unsigned, Register> ExtInstSetMap;
   // Contains the list of all global OpVariables in the module.
-  SmallVector<MachineInstr *, 4> GlobalVarList;
+  SmallVector<const MachineInstr *, 4> GlobalVarList;
   // Maps functions to corresponding function ID registers.
   DenseMap<const Function *, Register> FuncMap;
   // The set contains machine instructions which are necessary
   // for correct MIR but will not be emitted in function bodies.
-  DenseSet<MachineInstr *> InstrsToDelete;
+  DenseSet<const MachineInstr *> InstrsToDelete;
   // The table contains global aliases of local registers for each machine
   // function. The aliases are used to substitute local registers during
   // code emission.
@@ -167,7 +167,7 @@ struct ModuleAnalysisInfo {
   }
   Register getExtInstSetReg(unsigned SetNum) { return ExtInstSetMap[SetNum]; }
   InstrList &getMSInstrs(unsigned MSType) { return MS[MSType]; }
-  void setSkipEmission(MachineInstr *MI) { InstrsToDelete.insert(MI); }
+  void setSkipEmission(const MachineInstr *MI) { InstrsToDelete.insert(MI); }
   bool getSkipEmission(const MachineInstr *MI) {
     return InstrsToDelete.contains(MI);
   }
@@ -204,6 +204,10 @@ struct ModuleAnalysisInfo {
 };
 } // namespace SPIRV
 
+using InstrSignature = SmallVector<size_t>;
+using InstrTraces = std::set<InstrSignature>;
+using InstrGRegsMap = std::map<SmallVector<size_t>, unsigned>;
+
 struct SPIRVModuleAnalysis : public ModulePass {
   static char ID;
 
@@ -216,17 +220,27 @@ public:
 
 private:
   void setBaseInfo(const Module &M);
-  void collectGlobalEntities(
-      const std::vector<SPIRV::DTSortableEntry *> &DepsGraph,
-      SPIRV::ModuleSectionType MSType,
-      std::function<bool(const SPIRV::DTSortableEntry *)> Pred,
-      bool UsePreOrder);
-  void processDefInstrs(const Module &M);
   void collectFuncNames(MachineInstr &MI, const Function *F);
   void processOtherInstrs(const Module &M);
   void numberRegistersGlobally(const Module &M);
-  void collectFuncPtrs();
-  void collectFuncPtrs(MachineInstr *MI);
+
+  // analyze dependencies to collect module scope definitions
+  void collectDeclarations(const Module &M);
+  void visitDecl(const MachineRegisterInfo &MRI, InstrGRegsMap &SignatureToGReg,
+                 std::map<const Value *, unsigned> &GlobalToGReg,
+                 const MachineFunction *MF, const MachineInstr &MI);
+  Register handleVariable(const MachineFunction *MF, const MachineInstr &MI,
+                          std::map<const Value *, unsigned> &GlobalToGReg);
+  Register handleTypeDeclOrConstant(const MachineInstr &MI,
+                                    InstrGRegsMap &SignatureToGReg);
+  Register
+  handleFunctionOrParameter(const MachineFunction *MF, const MachineInstr &MI,
+                            std::map<const Value *, unsigned> &GlobalToGReg,
+                            bool &IsFunDef);
+  void visitFunPtrUse(Register OpReg, InstrGRegsMap &SignatureToGReg,
+                      std::map<const Value *, unsigned> &GlobalToGReg,
+                      const MachineFunction *MF, const MachineInstr &MI);
+  bool isDeclSection(const MachineRegisterInfo &MRI, const MachineInstr &MI);
 
   const SPIRVSubtarget *ST;
   SPIRVGlobalRegistry *GR;
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index 8357c30d6949..5b4c84918ab4 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -58,9 +58,10 @@ addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR,
                              ->getValue());
       if (auto *GV = dyn_cast<GlobalValue>(Const)) {
         Register Reg = GR->find(GV, &MF);
-        if (!Reg.isValid())
+        if (!Reg.isValid()) {
           GR->add(GV, &MF, SrcReg);
-        else
+          GR->addGlobalObject(GV, &MF, SrcReg);
+        } else
           RegsAlreadyAddedToDT[&MI] = Reg;
       } else {
         Register Reg = GR->find(Const, &MF);
diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
index 2e4343c7922f..336cde4e7822 100644
--- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
@@ -18,16 +18,14 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/IR/Analysis.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsSPIRV.h"
-#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/PassRegistry.h"
-#include "llvm/Transforms/Utils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
@@ -648,7 +646,8 @@ class SPIRVStructurizer : public FunctionPass {
       Builder.SetInsertPoint(Header->getTerminator());
 
       auto MergeAddress = BlockAddress::get(BB.getParent(), &BB);
-      createOpSelectMerge(&Builder, MergeAddress);
+      SmallVector<Value *, 1> Args = {MergeAddress};
+      Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args});
 
       Modified = true;
     }
@@ -770,9 +769,10 @@ class SPIRVStructurizer : public FunctionPass {
       BasicBlock *Merge = Candidates[0];
 
       auto MergeAddress = BlockAddress::get(Merge->getParent(), Merge);
+      SmallVector<Value *, 1> Args = {MergeAddress};
       IRBuilder<> Builder(&BB);
       Builder.SetInsertPoint(BB.getTerminator());
-      createOpSelectMerge(&Builder, MergeAddress);
+      Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args});
     }
 
     return Modified;
@@ -1105,7 +1105,8 @@ class SPIRVStructurizer : public FunctionPass {
         Builder.SetInsertPoint(Header->getTerminator());
 
         auto MergeAddress = BlockAddress::get(Merge->getParent(), Merge);
-        createOpSelectMerge(&Builder, MergeAddress);
+        SmallVector<Value *, 1> Args = {MergeAddress};
+        Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args});
         continue;
       }
 
@@ -1119,7 +1120,8 @@ class SPIRVStructurizer : public FunctionPass {
       Builder.SetInsertPoint(Header->getTerminator());
 
       auto MergeAddress = BlockAddress::get(NewMerge->getParent(), NewMerge);
-      createOpSelectMerge(&Builder, MergeAddress);
+      SmallVector<Value *, 1> Args = {MergeAddress};
+      Builder.CreateIntrinsic(Intrinsic::spv_selection_merge, {}, {Args});
     }
 
     return Modified;
@@ -1206,27 +1208,6 @@ public:
     AU.addPreserved<SPIRVConvergenceRegionAnalysisWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
-
-  void createOpSelectMerge(IRBuilder<> *Builder, BlockAddress *MergeAddress) {
-    Instruction *BBTerminatorInst = Builder->GetInsertBlock()->getTerminator();
-
-    MDNode *MDNode = BBTerminatorInst->getMetadata("hlsl.controlflow.hint");
-
-    ConstantInt *BranchHint = llvm::ConstantInt::get(Builder->getInt32Ty(), 0);
-
-    if (MDNode) {
-      assert(MDNode->getNumOperands() == 2 &&
-             "invalid metadata hlsl.controlflow.hint");
-      BranchHint = mdconst::extract<ConstantInt>(MDNode->getOperand(1));
-
-      assert(BranchHint && "invalid metadata value for hlsl.controlflow.hint");
-    }
-
-    llvm::SmallVector<llvm::Value *, 2> Args = {MergeAddress, BranchHint};
-
-    Builder->CreateIntrinsic(Intrinsic::spv_selection_merge,
-                             {MergeAddress->getType()}, {Args});
-  }
 };
 } // namespace llvm
 
@@ -1248,11 +1229,8 @@ FunctionPass *llvm::createSPIRVStructurizerPass() {
 
 PreservedAnalyses SPIRVStructurizerWrapper::run(Function &F,
                                                 FunctionAnalysisManager &AF) {
-
-  auto FPM = legacy::FunctionPassManager(F.getParent());
-  FPM.add(createSPIRVStructurizerPass());
-
-  if (!FPM.run(F))
+  FunctionPass *StructurizerPass = createSPIRVStructurizerPass();
+  if (!StructurizerPass->runOnFunction(F))
     return PreservedAnalyses::all();
   PreservedAnalyses PA;
   PA.preserveSet<CFGAnalyses>();
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index da2e24c0c9ab..60649eac6281 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -17,6 +17,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/TypedPointerType.h"
 #include <queue>
@@ -236,6 +237,10 @@ Type *parseBasicTypeName(StringRef &TypeName, LLVMContext &Ctx);
 // Returns true if the function was changed.
 bool sortBlocks(Function &F);
 
+inline bool hasInitializer(const GlobalVariable *GV) {
+  return GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer());
+}
+
 // True if this is an instance of TypedPointerType.
 inline bool isTypedPointerTy(const Type *T) {
   return T && T->getTypeID() == Type::TypedPointerTyID;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 68bdeb1cebeb..6b0eb38e7e09 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -94,7 +94,7 @@ static cl::opt<int> BrMergingCcmpBias(
 
 static cl::opt<bool>
     WidenShift("x86-widen-shift", cl::init(true),
-               cl::desc("Replacte narrow shifts with wider shifts."),
+               cl::desc("Replace narrow shifts with wider shifts."),
                cl::Hidden);
 
 static cl::opt<int> BrMergingLikelyBias(
@@ -41694,6 +41694,8 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget &Subtarget) {
   MVT VT = N.getSimpleValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+
   SmallVector<int, 4> Mask;
   unsigned Opcode = N.getOpcode();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -41979,7 +41981,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
       APInt Mask = APInt::getHighBitsSet(64, 32);
       if (DAG.MaskedValueIsZero(In, Mask)) {
         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
-        MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
+        MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
         SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
         SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
         return DAG.getBitcast(VT, Movl);
@@ -41994,7 +41996,6 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
         // Create a vector constant - scalar constant followed by zeros.
         EVT ScalarVT = N0.getOperand(0).getValueType();
         Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
-        unsigned NumElts = VT.getVectorNumElements();
         Constant *Zero = ConstantInt::getNullValue(ScalarTy);
         SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
         ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
@@ -42045,9 +42046,8 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
         MVT SrcVT = N0.getOperand(0).getSimpleValueType();
         unsigned SrcBits = SrcVT.getScalarSizeInBits();
         if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
-          unsigned Size = VT.getVectorNumElements();
           unsigned NewSize = SrcVT.getVectorNumElements();
-          APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size);
+          APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(NumElts);
           APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
           return DAG.getBitcast(
               VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
@@ -42460,7 +42460,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
       DMask[DOffset + 0] = DOffset + 1;
       DMask[DOffset + 1] = DOffset + 0;
-      MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+      MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
       V = DAG.getBitcast(DVT, V);
       V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
                       getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 7a7554cdb990..c19bcfc5524c 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1650,6 +1650,13 @@ InstructionCost X86TTIImpl::getShuffleCost(
         return MatchingTypes ? TTI::TCC_Free : SubLT.first;
     }
 
+    // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have
+    // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of
+    // v1f32 (legalised to f32) into a v4f32.
+    if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 &&
+        SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
+      return 1;
+
     // If the insertion isn't aligned, treat it like a 2-op shuffle.
     Kind = TTI::SK_PermuteTwoSrc;
   }
@@ -4797,9 +4804,12 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
     MVT MScalarTy = LT.second.getScalarType();
     auto IsCheapPInsrPExtrInsertPS = [&]() {
       // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
+      // Inserting f32 into index0 is just movss.
       // Also, assume insertps is relatively cheap on all >= SSE41 targets.
       return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
              (MScalarTy.isInteger() && ST->hasSSE41()) ||
+             (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
+              Opcode == Instruction::InsertElement) ||
              (MScalarTy == MVT::f32 && ST->hasSSE41() &&
               Opcode == Instruction::InsertElement);
     };
diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp
index 50c9a565e7ae..7d0b8c333f72 100644
--- a/llvm/lib/TargetParser/AArch64TargetParser.cpp
+++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp
@@ -48,17 +48,12 @@ std::optional<AArch64::ArchInfo> AArch64::ArchInfo::findBySubArch(StringRef SubA
   return {};
 }
 
-unsigned AArch64::getFMVPriority(ArrayRef<StringRef> Features) {
-  constexpr unsigned MaxFMVPriority = 1000;
-  unsigned Priority = 0;
-  unsigned NumFeatures = 0;
-  for (StringRef Feature : Features) {
-    if (auto Ext = parseFMVExtension(Feature)) {
-      Priority = std::max(Priority, Ext->Priority);
-      NumFeatures++;
-    }
-  }
-  return Priority + MaxFMVPriority * NumFeatures;
+uint64_t AArch64::getFMVPriority(ArrayRef<StringRef> Features) {
+  uint64_t Priority = 0;
+  for (StringRef Feature : Features)
+    if (std::optional<FMVInfo> Info = parseFMVExtension(Feature))
+      Priority |= (1ULL << Info->PriorityBit);
+  return Priority;
 }
 
 uint64_t AArch64::getCpuSupportsMask(ArrayRef<StringRef> Features) {
@@ -73,7 +68,7 @@ uint64_t AArch64::getCpuSupportsMask(ArrayRef<StringRef> Features) {
   uint64_t FeaturesMask = 0;
   for (const FMVInfo &Info : getFMVInfo())
     if (Info.ID && FeatureBits.Enabled.test(*Info.ID))
-      FeaturesMask |= (1ULL << Info.Bit);
+      FeaturesMask |= (1ULL << Info.FeatureBit);
 
   return FeaturesMask;
 }
diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index 4f403e9fb6f5..d6e1eac0d85a 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -742,8 +742,8 @@ Error RISCVISAInfo::checkDependency() {
   bool HasZvl = MinVLen != 0;
   bool HasZcmt = Exts.count("zcmt") != 0;
   static constexpr StringLiteral XqciExts[] = {
-      {"xqcia"},   {"xqciac"},  {"xqcicli"}, {"xqcics"},
-      {"xqcicsr"}, {"xqcilsm"}, {"xqcisls"}};
+      {"xqcia"},  {"xqciac"},  {"xqcicli"}, {"xqcicm"},
+      {"xqcics"}, {"xqcicsr"}, {"xqcilsm"}, {"xqcisls"}};
 
   if (HasI && HasE)
     return getIncompatibleError("i", "e");
diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index 240d089ebeff..7b59c39283de 100644
--- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -69,7 +69,6 @@ static const char *const CoroIntrinsics[] = {
     "llvm.coro.async.context.dealloc",
     "llvm.coro.async.resume",
     "llvm.coro.async.size.replace",
-    "llvm.coro.async.store_resume",
     "llvm.coro.await.suspend.bool",
     "llvm.coro.await.suspend.handle",
     "llvm.coro.await.suspend.void",
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 96956481df2f..449d64d1614f 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -66,19 +66,19 @@ static cl::opt<unsigned> MaxCodeSizeGrowth(
     "Maximum codesize growth allowed per function"));
 
 static cl::opt<unsigned> MinCodeSizeSavings(
-    "funcspec-min-codesize-savings", cl::init(20), cl::Hidden, cl::desc(
-    "Reject specializations whose codesize savings are less than this"
-    "much percent of the original function size"));
+    "funcspec-min-codesize-savings", cl::init(20), cl::Hidden,
+    cl::desc("Reject specializations whose codesize savings are less than this "
+             "much percent of the original function size"));
 
 static cl::opt<unsigned> MinLatencySavings(
     "funcspec-min-latency-savings", cl::init(40), cl::Hidden,
-    cl::desc("Reject specializations whose latency savings are less than this"
+    cl::desc("Reject specializations whose latency savings are less than this "
              "much percent of the original function size"));
 
 static cl::opt<unsigned> MinInliningBonus(
-    "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden, cl::desc(
-    "Reject specializations whose inlining bonus is less than this"
-    "much percent of the original function size"));
+    "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden,
+    cl::desc("Reject specializations whose inlining bonus is less than this "
+             "much percent of the original function size"));
 
 static cl::opt<bool> SpecializeOnAddress(
     "funcspec-on-address", cl::init(false), cl::Hidden, cl::desc(
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 16a80e9ebbea..78cd249c9c16 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -105,7 +105,7 @@ static cl::opt<int> ColdCCRelFreq(
     "coldcc-rel-freq", cl::Hidden, cl::init(2),
     cl::desc(
         "Maximum block frequency, expressed as a percentage of caller's "
-        "entry frequency, for a call site to be considered cold for enabling"
+        "entry frequency, for a call site to be considered cold for enabling "
         "coldcc"));
 
 /// Is this global variable possibly used by a leak checker as a root?  If so,
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 1bf7ff468d78..016db55c99c3 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -122,6 +122,20 @@ static cl::opt<unsigned>
                         cl::desc("Max depth to recursively search for missing "
                                  "frames through tail calls."));
 
+// By default enable cloning of callsites involved with recursive cycles
+static cl::opt<bool> AllowRecursiveCallsites(
+    "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
+    cl::desc("Allow cloning of callsites involved in recursive cycles"));
+
+// When disabled, try to detect and prevent cloning of recursive contexts.
+// This is only necessary until we support cloning through recursive cycles.
+// Leave on by default for now, as disabling requires a little bit of compile
+// time overhead and doesn't affect correctness, it will just inflate the cold
+// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
+static cl::opt<bool> AllowRecursiveContexts(
+    "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
+    cl::desc("Allow cloning of contexts through recursive cycles"));
+
 namespace llvm {
 cl::opt<bool> EnableMemProfContextDisambiguation(
     "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
@@ -1236,9 +1250,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
       StackEntryIdToContextNodeMap[StackId] = StackNode;
       StackNode->OrigStackOrAllocId = StackId;
     }
-    auto Ins = StackIdSet.insert(StackId);
-    if (!Ins.second)
-      StackNode->Recursive = true;
+    // Marking a node recursive will prevent its cloning completely, even for
+    // non-recursive contexts flowing through it.
+    if (!AllowRecursiveCallsites) {
+      auto Ins = StackIdSet.insert(StackId);
+      if (!Ins.second)
+        StackNode->Recursive = true;
+    }
     StackNode->AllocTypes |= (uint8_t)AllocType;
     PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
     PrevNode = StackNode;
@@ -1375,8 +1393,11 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
       set_union(CallerEdgeContextIds, Edge->ContextIds);
     }
     // Node can have more context ids than callers if some contexts terminate at
-    // node and some are longer.
-    assert(NodeContextIds == CallerEdgeContextIds ||
+    // node and some are longer. If we are allowing recursive callsites but
+    // haven't disabled recursive contexts, this will be violated for
+    // incompletely cloned recursive cycles, so skip the checking in that case.
+    assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
+           NodeContextIds == CallerEdgeContextIds ||
            set_is_subset(CallerEdgeContextIds, NodeContextIds));
   }
   if (Node->CalleeEdges.size()) {
@@ -3370,6 +3391,21 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
 
   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
 
+  DenseSet<uint32_t> RecursiveContextIds;
+  // If we are allowing recursive callsites, but have also disabled recursive
+  // contexts, look for context ids that show up in multiple caller edges.
+  if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
+    DenseSet<uint32_t> AllCallerContextIds;
+    for (auto &CE : Node->CallerEdges) {
+      // Resize to the largest set of caller context ids, since we know the
+      // final set will be at least that large.
+      AllCallerContextIds.reserve(CE->getContextIds().size());
+      for (auto Id : CE->getContextIds())
+        if (!AllCallerContextIds.insert(Id).second)
+          RecursiveContextIds.insert(Id);
+    }
+  }
+
   // Iterate until we find no more opportunities for disambiguating the alloc
   // types via cloning. In most cases this loop will terminate once the Node
   // has a single allocation type, in which case no more cloning is needed.
@@ -3394,6 +3430,9 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
     // allocation.
     auto CallerEdgeContextsForAlloc =
         set_intersection(CallerEdge->getContextIds(), AllocContextIds);
+    if (!RecursiveContextIds.empty())
+      CallerEdgeContextsForAlloc =
+          set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
     if (CallerEdgeContextsForAlloc.empty()) {
       ++EI;
       continue;
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index b40ab357670b..67585e9c80ef 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -129,7 +129,7 @@ static cl::opt<bool> PrintModuleBeforeOptimizations(
 
 static cl::opt<bool> AlwaysInlineDeviceFunctions(
     "openmp-opt-inline-device",
-    cl::desc("Inline all applicible functions on the device."), cl::Hidden,
+    cl::desc("Inline all applicable functions on the device."), cl::Hidden,
     cl::init(false));
 
 static cl::opt<bool>
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 603beb3b883d..b978c54ef96f 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -162,7 +162,7 @@ static cl::opt<bool> ProfileSampleBlockAccurate(
 static cl::opt<bool> ProfileAccurateForSymsInList(
     "profile-accurate-for-symsinlist", cl::Hidden, cl::init(true),
     cl::desc("For symbols in profile symbol list, regard their profiles to "
-             "be accurate. It may be overriden by profile-sample-accurate. "));
+             "be accurate. It may be overridden by profile-sample-accurate. "));
 
 static cl::opt<bool> ProfileMergeInlinee(
     "sample-profile-merge-inlinee", cl::Hidden, cl::init(true),
@@ -193,9 +193,10 @@ static cl::opt<bool> ProfileSizeInline(
 // and inline the hot functions (that are skipped in this pass).
 static cl::opt<bool> DisableSampleLoaderInlining(
     "disable-sample-loader-inlining", cl::Hidden, cl::init(false),
-    cl::desc("If true, artifically skip inline transformation in sample-loader "
-             "pass, and merge (or scale) profiles (as configured by "
-             "--sample-profile-merge-inlinee)."));
+    cl::desc(
+        "If true, artificially skip inline transformation in sample-loader "
+        "pass, and merge (or scale) profiles (as configured by "
+        "--sample-profile-merge-inlinee)."));
 
 namespace llvm {
 cl::opt<bool>
@@ -255,7 +256,7 @@ static cl::opt<unsigned> PrecentMismatchForStalenessError(
 
 static cl::opt<bool> CallsitePrioritizedInline(
     "sample-profile-prioritized-inline", cl::Hidden,
-    cl::desc("Use call site prioritized inlining for sample profile loader."
+    cl::desc("Use call site prioritized inlining for sample profile loader. "
              "Currently only CSSPGO is supported."));
 
 static cl::opt<bool> UsePreInlinerDecision(
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index e9bb2b884756..f82a557e5760 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -39,8 +39,7 @@ static Value *getNewICmpValue(unsigned Code, bool Sign, Value *LHS, Value *RHS,
 /// This is the complement of getFCmpCode, which turns an opcode and two
 /// operands into either a FCmp instruction, or a true/false constant.
 static Value *getFCmpValue(unsigned Code, Value *LHS, Value *RHS,
-                           InstCombiner::BuilderTy &Builder,
-                           FastMathFlags FMF) {
+                           InstCombiner::BuilderTy &Builder, FMFSource FMF) {
   FCmpInst::Predicate NewPred;
   if (Constant *TorF = getPredForFCmpCode(Code, LHS->getType(), NewPred))
     return TorF;
@@ -514,7 +513,8 @@ static Value *foldLogOpOfMaskedICmpsAsymmetric(
 /// into a single (icmp(A & X) ==/!= Y).
 static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
                                      bool IsLogical,
-                                     InstCombiner::BuilderTy &Builder) {
+                                     InstCombiner::BuilderTy &Builder,
+                                     const SimplifyQuery &Q) {
   Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr;
   ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
   std::optional<std::pair<unsigned, unsigned>> MaskPair =
@@ -587,93 +587,107 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
     return Builder.CreateICmp(NewCC, NewAnd2, A);
   }
 
-  // Remaining cases assume at least that B and D are constant, and depend on
-  // their actual values. This isn't strictly necessary, just a "handle the
-  // easy cases for now" decision.
   const APInt *ConstB, *ConstD;
-  if (!match(B, m_APInt(ConstB)) || !match(D, m_APInt(ConstD)))
-    return nullptr;
-
-  if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) {
-    // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and
-    // (icmp ne (A & B), B) & (icmp ne (A & D), D)
-    //     -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0)
-    // Only valid if one of the masks is a superset of the other (check "B&D" is
-    // the same as either B or D).
-    APInt NewMask = *ConstB & *ConstD;
-    if (NewMask == *ConstB)
-      return LHS;
-    else if (NewMask == *ConstD)
-      return RHS;
-  }
-
-  if (Mask & AMask_NotAllOnes) {
-    // (icmp ne (A & B), B) & (icmp ne (A & D), D)
-    //     -> (icmp ne (A & B), A) or (icmp ne (A & D), A)
-    // Only valid if one of the masks is a superset of the other (check "B|D" is
-    // the same as either B or D).
-    APInt NewMask = *ConstB | *ConstD;
-    if (NewMask == *ConstB)
-      return LHS;
-    else if (NewMask == *ConstD)
-      return RHS;
-  }
-
-  if (Mask & (BMask_Mixed | BMask_NotMixed)) {
-    // Mixed:
-    // (icmp eq (A & B), C) & (icmp eq (A & D), E)
-    // We already know that B & C == C && D & E == E.
-    // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of
-    // C and E, which are shared by both the mask B and the mask D, don't
-    // contradict, then we can transform to
-    // -> (icmp eq (A & (B|D)), (C|E))
-    // Currently, we only handle the case of B, C, D, and E being constant.
-    // We can't simply use C and E because we might actually handle
-    //   (icmp ne (A & B), B) & (icmp eq (A & D), D)
-    // with B and D, having a single bit set.
-
-    // NotMixed:
-    // (icmp ne (A & B), C) & (icmp ne (A & D), E)
-    // -> (icmp ne (A & (B & D)), (C & E))
-    // Check the intersection (B & D) for inequality.
-    // Assume that (B & D) == B || (B & D) == D, i.e B/D is a subset of D/B
-    // and (B & D) & (C ^ E) == 0, bits of C and E, which are shared by both the
-    // B and the D, don't contradict.
-    // Note that we can assume (~B & C) == 0 && (~D & E) == 0, previous
-    // operation should delete these icmps if it hadn't been met.
-
-    const APInt *OldConstC, *OldConstE;
-    if (!match(C, m_APInt(OldConstC)) || !match(E, m_APInt(OldConstE)))
-      return nullptr;
-
-    auto FoldBMixed = [&](ICmpInst::Predicate CC, bool IsNot) -> Value * {
-      CC = IsNot ? CmpInst::getInversePredicate(CC) : CC;
-      const APInt ConstC = PredL != CC ? *ConstB ^ *OldConstC : *OldConstC;
-      const APInt ConstE = PredR != CC ? *ConstD ^ *OldConstE : *OldConstE;
+  if (match(B, m_APInt(ConstB)) && match(D, m_APInt(ConstD))) {
+    if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) {
+      // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and
+      // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+      //     -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0)
+      // Only valid if one of the masks is a superset of the other (check "B&D"
+      // is the same as either B or D).
+      APInt NewMask = *ConstB & *ConstD;
+      if (NewMask == *ConstB)
+        return LHS;
+      if (NewMask == *ConstD)
+        return RHS;
+    }
 
-      if (((*ConstB & *ConstD) & (ConstC ^ ConstE)).getBoolValue())
-        return IsNot ? nullptr : ConstantInt::get(LHS->getType(), !IsAnd);
+    if (Mask & AMask_NotAllOnes) {
+      // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+      //     -> (icmp ne (A & B), A) or (icmp ne (A & D), A)
+      // Only valid if one of the masks is a superset of the other (check "B|D"
+      // is the same as either B or D).
+      APInt NewMask = *ConstB | *ConstD;
+      if (NewMask == *ConstB)
+        return LHS;
+      if (NewMask == *ConstD)
+        return RHS;
+    }
 
-      if (IsNot && !ConstB->isSubsetOf(*ConstD) && !ConstD->isSubsetOf(*ConstB))
+    if (Mask & (BMask_Mixed | BMask_NotMixed)) {
+      // Mixed:
+      // (icmp eq (A & B), C) & (icmp eq (A & D), E)
+      // We already know that B & C == C && D & E == E.
+      // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of
+      // C and E, which are shared by both the mask B and the mask D, don't
+      // contradict, then we can transform to
+      // -> (icmp eq (A & (B|D)), (C|E))
+      // Currently, we only handle the case of B, C, D, and E being constant.
+      // We can't simply use C and E because we might actually handle
+      //   (icmp ne (A & B), B) & (icmp eq (A & D), D)
+      // with B and D, having a single bit set.
+
+      // NotMixed:
+      // (icmp ne (A & B), C) & (icmp ne (A & D), E)
+      // -> (icmp ne (A & (B & D)), (C & E))
+      // Check the intersection (B & D) for inequality.
+      // Assume that (B & D) == B || (B & D) == D, i.e B/D is a subset of D/B
+      // and (B & D) & (C ^ E) == 0, bits of C and E, which are shared by both
+      // the B and the D, don't contradict. Note that we can assume (~B & C) ==
+      // 0 && (~D & E) == 0, previous operation should delete these icmps if it
+      // hadn't been met.
+
+      const APInt *OldConstC, *OldConstE;
+      if (!match(C, m_APInt(OldConstC)) || !match(E, m_APInt(OldConstE)))
         return nullptr;
 
-      APInt BD, CE;
-      if (IsNot) {
-        BD = *ConstB & *ConstD;
-        CE = ConstC & ConstE;
-      } else {
-        BD = *ConstB | *ConstD;
-        CE = ConstC | ConstE;
-      }
-      Value *NewAnd = Builder.CreateAnd(A, BD);
-      Value *CEVal = ConstantInt::get(A->getType(), CE);
-      return Builder.CreateICmp(CC, CEVal, NewAnd);
-    };
+      auto FoldBMixed = [&](ICmpInst::Predicate CC, bool IsNot) -> Value * {
+        CC = IsNot ? CmpInst::getInversePredicate(CC) : CC;
+        const APInt ConstC = PredL != CC ? *ConstB ^ *OldConstC : *OldConstC;
+        const APInt ConstE = PredR != CC ? *ConstD ^ *OldConstE : *OldConstE;
+
+        if (((*ConstB & *ConstD) & (ConstC ^ ConstE)).getBoolValue())
+          return IsNot ? nullptr : ConstantInt::get(LHS->getType(), !IsAnd);
+
+        if (IsNot && !ConstB->isSubsetOf(*ConstD) &&
+            !ConstD->isSubsetOf(*ConstB))
+          return nullptr;
+
+        APInt BD, CE;
+        if (IsNot) {
+          BD = *ConstB & *ConstD;
+          CE = ConstC & ConstE;
+        } else {
+          BD = *ConstB | *ConstD;
+          CE = ConstC | ConstE;
+        }
+        Value *NewAnd = Builder.CreateAnd(A, BD);
+        Value *CEVal = ConstantInt::get(A->getType(), CE);
+        return Builder.CreateICmp(CC, CEVal, NewAnd);
+      };
+
+      if (Mask & BMask_Mixed)
+        return FoldBMixed(NewCC, false);
+      if (Mask & BMask_NotMixed) // can be else also
+        return FoldBMixed(NewCC, true);
+    }
+  }
 
-    if (Mask & BMask_Mixed)
-      return FoldBMixed(NewCC, false);
-    if (Mask & BMask_NotMixed) // can be else also
-      return FoldBMixed(NewCC, true);
+  // (icmp eq (A & B), 0) | (icmp eq (A & D), 0)
+  // -> (icmp ne (A & (B|D)), (B|D))
+  // (icmp ne (A & B), 0) & (icmp ne (A & D), 0)
+  // -> (icmp eq (A & (B|D)), (B|D))
+  // iff B and D is known to be a power of two
+  if (Mask & Mask_NotAllZeros &&
+      isKnownToBeAPowerOfTwo(B, /*OrZero=*/false, /*Depth=*/0, Q) &&
+      isKnownToBeAPowerOfTwo(D, /*OrZero=*/false, /*Depth=*/0, Q)) {
+    // If this is a logical and/or, then we must prevent propagation of a
+    // poison value from the RHS by inserting freeze.
+    if (IsLogical)
+      D = Builder.CreateFreeze(D);
+    Value *Mask = Builder.CreateOr(B, D);
+    Value *Masked = Builder.CreateAnd(A, Mask);
+    return Builder.CreateICmp(NewCC, Masked, Mask);
   }
   return nullptr;
 }
@@ -776,46 +790,6 @@ foldAndOrOfICmpsWithPow2AndWithZero(InstCombiner::BuilderTy &Builder,
   return Builder.CreateICmp(Pred, And, Op);
 }
 
-// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
-// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
-Value *InstCombinerImpl::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS,
-                                                       ICmpInst *RHS,
-                                                       Instruction *CxtI,
-                                                       bool IsAnd,
-                                                       bool IsLogical) {
-  CmpInst::Predicate Pred = IsAnd ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
-  if (LHS->getPredicate() != Pred || RHS->getPredicate() != Pred)
-    return nullptr;
-
-  if (!match(LHS->getOperand(1), m_Zero()) ||
-      !match(RHS->getOperand(1), m_Zero()))
-    return nullptr;
-
-  Value *L1, *L2, *R1, *R2;
-  if (match(LHS->getOperand(0), m_And(m_Value(L1), m_Value(L2))) &&
-      match(RHS->getOperand(0), m_And(m_Value(R1), m_Value(R2)))) {
-    if (L1 == R2 || L2 == R2)
-      std::swap(R1, R2);
-    if (L2 == R1)
-      std::swap(L1, L2);
-
-    if (L1 == R1 &&
-        isKnownToBeAPowerOfTwo(L2, false, 0, CxtI) &&
-        isKnownToBeAPowerOfTwo(R2, false, 0, CxtI)) {
-      // If this is a logical and/or, then we must prevent propagation of a
-      // poison value from the RHS by inserting freeze.
-      if (IsLogical)
-        R2 = Builder.CreateFreeze(R2);
-      Value *Mask = Builder.CreateOr(L2, R2);
-      Value *Masked = Builder.CreateAnd(L1, Mask);
-      auto NewPred = IsAnd ? CmpInst::ICMP_EQ : CmpInst::ICMP_NE;
-      return Builder.CreateICmp(NewPred, Masked, Mask);
-    }
-  }
-
-  return nullptr;
-}
-
 /// General pattern:
 ///   X & Y
 ///
@@ -1431,8 +1405,7 @@ static Value *matchIsFiniteTest(InstCombiner::BuilderTy &Builder, FCmpInst *LHS,
     return nullptr;
 
   return Builder.CreateFCmpFMF(FCmpInst::getOrderedPredicate(PredR), RHS0, RHS1,
-                               LHS->getFastMathFlags() &
-                                   RHS->getFastMathFlags());
+                               FMFSource::intersect(LHS, RHS));
 }
 
 Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS,
@@ -1469,7 +1442,7 @@ Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS,
     // Intersect the fast math flags.
     // TODO: We can union the fast math flags unless this is a logical select.
     return getFCmpValue(NewPred, LHS0, LHS1, Builder,
-                        LHS->getFastMathFlags() & RHS->getFastMathFlags());
+                        FMFSource::intersect(LHS, RHS));
   }
 
   // This transform is not valid for a logical select.
@@ -1486,8 +1459,8 @@ Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS,
       // Ignore the constants because they are obviously not NANs:
       // (fcmp ord x, 0.0) & (fcmp ord y, 0.0)  -> (fcmp ord x, y)
       // (fcmp uno x, 0.0) | (fcmp uno y, 0.0)  -> (fcmp uno x, y)
-      return Builder.CreateFCmpFMF(
-          PredL, LHS0, RHS0, LHS->getFastMathFlags() & RHS->getFastMathFlags());
+      return Builder.CreateFCmpFMF(PredL, LHS0, RHS0,
+                                   FMFSource::intersect(LHS, RHS));
     }
   }
 
@@ -3327,12 +3300,6 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
                                           bool IsLogical) {
   const SimplifyQuery Q = SQ.getWithInstruction(&I);
 
-  // Fold (iszero(A & K1) | iszero(A & K2)) ->  (A & (K1 | K2)) != (K1 | K2)
-  // Fold (!iszero(A & K1) & !iszero(A & K2)) ->  (A & (K1 | K2)) == (K1 | K2)
-  // if K1 and K2 are a one-bit mask.
-  if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, &I, IsAnd, IsLogical))
-    return V;
-
   ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
   Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0);
   Value *LHS1 = LHS->getOperand(1), *RHS1 = RHS->getOperand(1);
@@ -3359,7 +3326,7 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
   // handle (roughly):
   // (icmp ne (A & B), C) | (icmp ne (A & D), E)
   // (icmp eq (A & B), C) & (icmp eq (A & D), E)
-  if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, IsAnd, IsLogical, Builder))
+  if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, IsAnd, IsLogical, Builder, Q))
     return V;
 
   if (Value *V =
@@ -4964,8 +4931,8 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
 
   // (A & B) ^ (A | C) --> A ? ~B : C -- There are 4 commuted variants.
   if (I.getType()->isIntOrIntVectorTy(1) &&
-      match(Op0, m_OneUse(m_LogicalAnd(m_Value(A), m_Value(B)))) &&
-      match(Op1, m_OneUse(m_LogicalOr(m_Value(C), m_Value(D))))) {
+      match(&I, m_c_Xor(m_OneUse(m_LogicalAnd(m_Value(A), m_Value(B))),
+                        m_OneUse(m_LogicalOr(m_Value(C), m_Value(D)))))) {
     bool NeedFreeze = isa<SelectInst>(Op0) && isa<SelectInst>(Op1) && B == D;
     if (B == C || B == D)
       std::swap(A, B);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 5494c70b34b1..c55c40c88bc8 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2674,9 +2674,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     // copysign Mag, (copysign ?, X) --> copysign Mag, X
     Value *X;
     if (match(Sign, m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(X)))) {
-      Value *CopySign = Builder.CreateCopySign(
-          Mag, X,
-          II->getFastMathFlags() & cast<Instruction>(Sign)->getFastMathFlags());
+      Value *CopySign =
+          Builder.CreateCopySign(Mag, X, FMFSource::intersect(II, Sign));
       return replaceInstUsesWith(*II, CopySign);
     }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 8b23583c5106..2e4572575994 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -2485,9 +2485,8 @@ Instruction *InstCombinerImpl::foldICmpShlConstant(ICmpInst &Cmp,
       // icmp ule i64 (shl X, 32), 8589934592 ->
       // icmp ule i32 (trunc X, i32), 2 ->
       // icmp ult i32 (trunc X, i32), 3
-      if (auto FlippedStrictness =
-              InstCombiner::getFlippedStrictnessPredicateAndConstant(
-                  Pred, ConstantInt::get(ShType->getContext(), C))) {
+      if (auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(
+              Pred, ConstantInt::get(ShType->getContext(), C))) {
         CmpPred = FlippedStrictness->first;
         RHSC = cast<ConstantInt>(FlippedStrictness->second)->getValue();
       }
@@ -3091,12 +3090,12 @@ Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp,
     unsigned BW = C.getBitWidth();
     std::bitset<4> Table;
     auto ComputeTable = [&](bool Op0Val, bool Op1Val) {
-      int Res = 0;
+      APInt Res(BW, 0);
       if (Op0Val)
-        Res += isa<ZExtInst>(Ext0) ? 1 : -1;
+        Res += APInt(BW, isa<ZExtInst>(Ext0) ? 1 : -1, /*isSigned=*/true);
       if (Op1Val)
-        Res += isa<ZExtInst>(Ext1) ? 1 : -1;
-      return ICmpInst::compare(APInt(BW, Res, true), C, Pred);
+        Res += APInt(BW, isa<ZExtInst>(Ext1) ? 1 : -1, /*isSigned=*/true);
+      return ICmpInst::compare(Res, C, Pred);
     };
 
     Table[0] = ComputeTable(false, false);
@@ -3280,8 +3279,7 @@ bool InstCombinerImpl::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS,
   if (PredB == ICmpInst::ICMP_SGT && isa<Constant>(RHS2)) {
     // x sgt C-1  <-->  x sge C  <-->  not(x slt C)
     auto FlippedStrictness =
-        InstCombiner::getFlippedStrictnessPredicateAndConstant(
-            PredB, cast<Constant>(RHS2));
+        getFlippedStrictnessPredicateAndConstant(PredB, cast<Constant>(RHS2));
     if (!FlippedStrictness)
       return false;
     assert(FlippedStrictness->first == ICmpInst::ICMP_SGE &&
@@ -6908,79 +6906,6 @@ Instruction *InstCombinerImpl::foldICmpUsingBoolRange(ICmpInst &I) {
   return nullptr;
 }
 
-std::optional<std::pair<CmpPredicate, Constant *>>
-InstCombiner::getFlippedStrictnessPredicateAndConstant(CmpPredicate Pred,
-                                                       Constant *C) {
-  assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) &&
-         "Only for relational integer predicates.");
-
-  Type *Type = C->getType();
-  bool IsSigned = ICmpInst::isSigned(Pred);
-
-  CmpInst::Predicate UnsignedPred = ICmpInst::getUnsignedPredicate(Pred);
-  bool WillIncrement =
-      UnsignedPred == ICmpInst::ICMP_ULE || UnsignedPred == ICmpInst::ICMP_UGT;
-
-  // Check if the constant operand can be safely incremented/decremented
-  // without overflowing/underflowing.
-  auto ConstantIsOk = [WillIncrement, IsSigned](ConstantInt *C) {
-    return WillIncrement ? !C->isMaxValue(IsSigned) : !C->isMinValue(IsSigned);
-  };
-
-  Constant *SafeReplacementConstant = nullptr;
-  if (auto *CI = dyn_cast<ConstantInt>(C)) {
-    // Bail out if the constant can't be safely incremented/decremented.
-    if (!ConstantIsOk(CI))
-      return std::nullopt;
-  } else if (auto *FVTy = dyn_cast<FixedVectorType>(Type)) {
-    unsigned NumElts = FVTy->getNumElements();
-    for (unsigned i = 0; i != NumElts; ++i) {
-      Constant *Elt = C->getAggregateElement(i);
-      if (!Elt)
-        return std::nullopt;
-
-      if (isa<UndefValue>(Elt))
-        continue;
-
-      // Bail out if we can't determine if this constant is min/max or if we
-      // know that this constant is min/max.
-      auto *CI = dyn_cast<ConstantInt>(Elt);
-      if (!CI || !ConstantIsOk(CI))
-        return std::nullopt;
-
-      if (!SafeReplacementConstant)
-        SafeReplacementConstant = CI;
-    }
-  } else if (isa<VectorType>(C->getType())) {
-    // Handle scalable splat
-    Value *SplatC = C->getSplatValue();
-    auto *CI = dyn_cast_or_null<ConstantInt>(SplatC);
-    // Bail out if the constant can't be safely incremented/decremented.
-    if (!CI || !ConstantIsOk(CI))
-      return std::nullopt;
-  } else {
-    // ConstantExpr?
-    return std::nullopt;
-  }
-
-  // It may not be safe to change a compare predicate in the presence of
-  // undefined elements, so replace those elements with the first safe constant
-  // that we found.
-  // TODO: in case of poison, it is safe; let's replace undefs only.
-  if (C->containsUndefOrPoisonElement()) {
-    assert(SafeReplacementConstant && "Replacement constant not set");
-    C = Constant::replaceUndefsWith(C, SafeReplacementConstant);
-  }
-
-  CmpInst::Predicate NewPred = CmpInst::getFlippedStrictnessPredicate(Pred);
-
-  // Increment or decrement the constant.
-  Constant *OneOrNegOne = ConstantInt::get(Type, WillIncrement ? 1 : -1, true);
-  Constant *NewC = ConstantExpr::getAdd(C, OneOrNegOne);
-
-  return std::make_pair(NewPred, NewC);
-}
-
 /// If we have an icmp le or icmp ge instruction with a constant operand, turn
 /// it into the appropriate icmp lt or icmp gt instruction. This transform
 /// allows them to be folded in visitICmpInst.
@@ -6996,8 +6921,7 @@ static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) {
   if (!Op1C)
     return nullptr;
 
-  auto FlippedStrictness =
-      InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, Op1C);
+  auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, Op1C);
   if (!FlippedStrictness)
     return nullptr;
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index b31ae374540b..f6992119280c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -435,9 +435,6 @@ private:
   Instruction *
   canonicalizeConditionalNegationViaMathToSelect(BinaryOperator &i);
 
-  Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
-                                       Instruction *CxtI, bool IsAnd,
-                                       bool IsLogical = false);
   Value *matchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D,
                               bool InvertFalseVal = false);
   Value *getSelectCondition(Value *A, Value *B, bool ABIsTheSame);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 272a1942c335..80308bf92dbb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -765,33 +765,14 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) {
   NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
   LoadInst *NewLI =
       new LoadInst(FirstLI->getType(), NewPN, "", IsVolatile, LoadAlignment);
-
-  unsigned KnownIDs[] = {
-    LLVMContext::MD_tbaa,
-    LLVMContext::MD_range,
-    LLVMContext::MD_invariant_load,
-    LLVMContext::MD_alias_scope,
-    LLVMContext::MD_noalias,
-    LLVMContext::MD_nonnull,
-    LLVMContext::MD_align,
-    LLVMContext::MD_dereferenceable,
-    LLVMContext::MD_dereferenceable_or_null,
-    LLVMContext::MD_access_group,
-    LLVMContext::MD_noundef,
-  };
-
-  for (unsigned ID : KnownIDs)
-    NewLI->setMetadata(ID, FirstLI->getMetadata(ID));
+  NewLI->copyMetadata(*FirstLI);
 
   // Add all operands to the new PHI and combine TBAA metadata.
   for (auto Incoming : drop_begin(zip(PN.blocks(), PN.incoming_values()))) {
     BasicBlock *BB = std::get<0>(Incoming);
     Value *V = std::get<1>(Incoming);
     LoadInst *LI = cast<LoadInst>(V);
-    // FIXME: https://github.com/llvm/llvm-project/issues/121495
-    // Call combineMetadataForCSE instead, so that an explicit set of KnownIDs
-    // doesn't need to be maintained here.
-    combineMetadata(NewLI, LI, KnownIDs, true);
+    combineMetadataForCSE(NewLI, LI, true);
     Value *NewInVal = LI->getOperand(0);
     if (NewInVal != InVal)
       InVal = nullptr;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 7fd91c72a2fb..1eca17751bac 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1689,8 +1689,7 @@ tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp,
     return nullptr;
 
   // Check the constant we'd have with flipped-strictness predicate.
-  auto FlippedStrictness =
-      InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, C0);
+  auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, C0);
   if (!FlippedStrictness)
     return nullptr;
 
@@ -1970,8 +1969,7 @@ static Value *foldSelectWithConstOpToBinOp(ICmpInst *Cmp, Value *TrueVal,
   Value *RHS;
   SelectPatternFlavor SPF;
   const DataLayout &DL = BOp->getDataLayout();
-  auto Flipped =
-      InstCombiner::getFlippedStrictnessPredicateAndConstant(Predicate, C1);
+  auto Flipped = getFlippedStrictnessPredicateAndConstant(Predicate, C1);
 
   if (C3 == ConstantFoldBinaryOpOperands(Opcode, C1, C2, DL)) {
     SPF = getSelectPattern(Predicate).Flavor;
@@ -2823,9 +2821,9 @@ static Instruction *foldSelectWithSRem(SelectInst &SI, InstCombinerImpl &IC,
   // %cnd = icmp slt i32 %rem, 0
   // %add = add i32 %rem, %n
   // %sel = select i1 %cnd, i32 %add, i32 %rem
-  if (match(TrueVal, m_Add(m_Specific(RemRes), m_Value(Remainder))) &&
+  if (match(TrueVal, m_c_Add(m_Specific(RemRes), m_Value(Remainder))) &&
       match(RemRes, m_SRem(m_Value(Op), m_Specific(Remainder))) &&
-      IC.isKnownToBeAPowerOfTwo(Remainder, /*OrZero*/ true) &&
+      IC.isKnownToBeAPowerOfTwo(Remainder, /*OrZero=*/true) &&
       FalseVal == RemRes)
     return FoldToBitwiseAnd(Remainder);
 
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index f63de1f0d410..2fb60ef11499 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -939,12 +939,11 @@ Instruction *InstCombinerImpl::foldBinOpShiftWithShift(BinaryOperator &I) {
                m_OneUse(m_Shift(m_Value(Y), m_Value(Shift)))))
       return nullptr;
     if (!match(I.getOperand(1 - ShOpnum),
-               m_BinOp(m_Value(ShiftedX), m_Value(Mask))))
+               m_c_BinOp(m_CombineAnd(
+                             m_OneUse(m_Shift(m_Value(X), m_Specific(Shift))),
+                             m_Value(ShiftedX)),
+                         m_Value(Mask))))
       return nullptr;
-
-    if (!match(ShiftedX, m_OneUse(m_Shift(m_Value(X), m_Specific(Shift)))))
-      return nullptr;
-
     // Make sure we are matching instruction shifts and not ConstantExpr
     auto *IY = dyn_cast<Instruction>(I.getOperand(ShOpnum));
     auto *IX = dyn_cast<Instruction>(ShiftedX);
@@ -1822,12 +1821,29 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN,
       continue;
     }
 
-    // If the only use of phi is comparing it with a constant then we can
-    // put this comparison in the incoming BB directly after a ucmp/scmp call
-    // because we know that it will simplify to a single icmp.
-    const APInt *Ignored;
-    if (isa<CmpIntrinsic>(InVal) && InVal->hasOneUser() &&
-        match(&I, m_ICmp(m_Specific(PN), m_APInt(Ignored)))) {
+    // Handle some cases that can't be fully simplified, but where we know that
+    // the two instructions will fold into one.
+    auto WillFold = [&]() {
+      if (!InVal->hasOneUser())
+        return false;
+
+      // icmp of ucmp/scmp with constant will fold to icmp.
+      const APInt *Ignored;
+      if (isa<CmpIntrinsic>(InVal) &&
+          match(&I, m_ICmp(m_Specific(PN), m_APInt(Ignored))))
+        return true;
+
+      // icmp eq zext(bool), 0 will fold to !bool.
+      if (isa<ZExtInst>(InVal) &&
+          cast<ZExtInst>(InVal)->getSrcTy()->isIntOrIntVectorTy(1) &&
+          match(&I,
+                m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(PN), m_Zero())))
+        return true;
+
+      return false;
+    };
+
+    if (WillFold()) {
       OpsToMoveUseToIncomingBB.push_back(i);
       NewPhiValues.push_back(nullptr);
       continue;
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 530061e3b6bb..2031728c2f33 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -192,7 +192,7 @@ static cl::opt<bool>
                    cl::Hidden);
 
 static cl::opt<int> ClHotPercentileCutoff("hwasan-percentile-cutoff-hot",
-                                          cl::desc("Hot percentile cuttoff."));
+                                          cl::desc("Hot percentile cutoff."));
 
 static cl::opt<float>
     ClRandomSkipRate("hwasan-random-rate",
diff --git a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
index 2418030dd601..f27798cfd228 100644
--- a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
+++ b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
@@ -30,7 +30,7 @@ using namespace llvm;
 
 static cl::opt<int>
     HotPercentileCutoff("lower-allow-check-percentile-cutoff-hot",
-                        cl::desc("Hot percentile cuttoff."));
+                        cl::desc("Hot percentile cutoff."));
 
 static cl::opt<float>
     RandomRate("lower-allow-check-random-rate",
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 471086ce3a75..db4d62ec3675 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -158,11 +158,11 @@ STATISTIC(NumCoveredBlocks, "Number of basic blocks that were executed");
 
 // Command line option to specify the file to read profile from. This is
 // mainly used for testing.
-static cl::opt<std::string>
-    PGOTestProfileFile("pgo-test-profile-file", cl::init(""), cl::Hidden,
-                       cl::value_desc("filename"),
-                       cl::desc("Specify the path of profile data file. This is"
-                                "mainly for test purpose."));
+static cl::opt<std::string> PGOTestProfileFile(
+    "pgo-test-profile-file", cl::init(""), cl::Hidden,
+    cl::value_desc("filename"),
+    cl::desc("Specify the path of profile data file. This is "
+             "mainly for test purpose."));
 static cl::opt<std::string> PGOTestProfileRemappingFile(
     "pgo-test-profile-remapping-file", cl::init(""), cl::Hidden,
     cl::value_desc("filename"),
@@ -186,7 +186,7 @@ static cl::opt<unsigned> MaxNumAnnotations(
 // to write to the metadata for a single memop intrinsic.
 static cl::opt<unsigned> MaxNumMemOPAnnotations(
     "memop-max-annotations", cl::init(4), cl::Hidden,
-    cl::desc("Max number of preicise value annotations for a single memop"
+    cl::desc("Max number of precise value annotations for a single memop"
              "intrinsic"));
 
 // Command line option to control appending FunctionHash to the name of a COMDAT
@@ -291,13 +291,13 @@ static cl::opt<bool> PGOVerifyHotBFI(
     cl::desc("Print out the non-match BFI count if a hot raw profile count "
              "becomes non-hot, or a cold raw profile count becomes hot. "
              "The print is enabled under -Rpass-analysis=pgo, or "
-             "internal option -pass-remakrs-analysis=pgo."));
+             "internal option -pass-remarks-analysis=pgo."));
 
 static cl::opt<bool> PGOVerifyBFI(
     "pgo-verify-bfi", cl::init(false), cl::Hidden,
     cl::desc("Print out mismatched BFI counts after setting profile metadata "
              "The print is enabled under -Rpass-analysis=pgo, or "
-             "internal option -pass-remakrs-analysis=pgo."));
+             "internal option -pass-remarks-analysis=pgo."));
 
 static cl::opt<unsigned> PGOVerifyBFIRatio(
     "pgo-verify-bfi-ratio", cl::init(2), cl::Hidden,
diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
index 19610958e47b..9cd81f3e6edb 100644
--- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
@@ -70,7 +70,7 @@ namespace {
 /// violations.
 struct TypeSanitizer {
   TypeSanitizer(Module &M);
-  bool run(Function &F, const TargetLibraryInfo &TLI);
+  bool sanitizeFunction(Function &F, const TargetLibraryInfo &TLI);
   void instrumentGlobals(Module &M);
 
 private:
@@ -510,7 +510,8 @@ void collectMemAccessInfo(
   }
 }
 
-bool TypeSanitizer::run(Function &F, const TargetLibraryInfo &TLI) {
+bool TypeSanitizer::sanitizeFunction(Function &F,
+                                     const TargetLibraryInfo &TLI) {
   // This is required to prevent instrumenting call to __tysan_init from within
   // the module constructor.
   if (&F == TysanCtorFunction.getCallee() || &F == TysanGlobalsSetTypeFunction)
@@ -876,15 +877,8 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Instruction *ShadowBase,
   return true;
 }
 
-PreservedAnalyses TypeSanitizerPass::run(Function &F,
-                                         FunctionAnalysisManager &FAM) {
-  TypeSanitizer TySan(*F.getParent());
-  TySan.run(F, FAM.getResult<TargetLibraryAnalysis>(F));
-  return PreservedAnalyses::none();
-}
-
-PreservedAnalyses ModuleTypeSanitizerPass::run(Module &M,
-                                               ModuleAnalysisManager &AM) {
+PreservedAnalyses TypeSanitizerPass::run(Module &M,
+                                         ModuleAnalysisManager &MAM) {
   Function *TysanCtorFunction;
   std::tie(TysanCtorFunction, std::ignore) =
       createSanitizerCtorAndInitFunctions(M, kTysanModuleCtorName,
@@ -894,5 +888,12 @@ PreservedAnalyses ModuleTypeSanitizerPass::run(Module &M,
   TypeSanitizer TySan(M);
   TySan.instrumentGlobals(M);
   appendToGlobalCtors(M, TysanCtorFunction, 0);
+
+  auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  for (Function &F : M) {
+    const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+    TySan.sanitizeFunction(F, TLI);
+  }
+
   return PreservedAnalyses::none();
 }
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index ba1c2241aea9..3c82eeda5483 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -128,7 +128,7 @@ static cl::opt<bool, true>
 
 static cl::opt<bool> UseLIRCodeSizeHeurs(
     "use-lir-code-size-heurs",
-    cl::desc("Use loop idiom recognition code size heuristics when compiling"
+    cl::desc("Use loop idiom recognition code size heuristics when compiling "
              "with -Os/-Oz"),
     cl::init(true), cl::Hidden);
 
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 260cc72c3188..090348809e57 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -104,7 +104,7 @@ static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
 
 static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
     "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
-    cl::desc("Don't allow loop unrolling to simulate more than this number of"
+    cl::desc("Don't allow loop unrolling to simulate more than this number of "
              "iterations when checking full unroll profitability"));
 
 static cl::opt<unsigned> UnrollCount(
diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index f58dcb51f64f..6e91c4fa6e23 100644
--- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -95,7 +95,7 @@ static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable";
 /// invariant instructions in a loop.
 static cl::opt<float>
     LVInvarThreshold("licm-versioning-invariant-threshold",
-                     cl::desc("LoopVersioningLICM's minimum allowed percentage"
+                     cl::desc("LoopVersioningLICM's minimum allowed percentage "
                               "of possible invariant instructions per loop"),
                      cl::init(25), cl::Hidden);
 
diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp
index 1d4f5618b39d..b499ef839729 100644
--- a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp
+++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp
@@ -28,8 +28,8 @@ using namespace llvm;
 namespace llvm {
 cl::opt<bool> ShouldPreserveAllAttributes(
     "assume-preserve-all", cl::init(false), cl::Hidden,
-    cl::desc("enable preservation of all attrbitues. even those that are "
-             "unlikely to be usefull"));
+    cl::desc("enable preservation of all attributes. even those that are "
+             "unlikely to be useful"));
 
 cl::opt<bool> EnableKnowledgeRetention(
     "enable-knowledge-retention", cl::init(false), cl::Hidden,
diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
index d8298646e18d..b3f9f76274d3 100644
--- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -778,7 +778,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops",
-                    false, true)
+                    false, false)
 
 // Publicly exposed interface to pass...
 char &llvm::LoopSimplifyID = LoopSimplify::ID;
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 03dc6c1d1744..e367b01a0909 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -96,8 +96,9 @@ using namespace PatternMatch;
 cl::opt<bool> llvm::RequireAndPreserveDomTree(
     "simplifycfg-require-and-preserve-domtree", cl::Hidden,
 
-    cl::desc("Temorary development switch used to gradually uplift SimplifyCFG "
-             "into preserving DomTree,"));
+    cl::desc(
+        "Temporary development switch used to gradually uplift SimplifyCFG "
+        "into preserving DomTree,"));
 
 // Chosen as 2 so as to be cheap, but still to have enough power to fold
 // a select, so the "clamp" idiom (of a min followed by a max) will be caught.
@@ -126,7 +127,7 @@ static cl::opt<bool> HoistLoadsStoresWithCondFaulting(
 
 static cl::opt<unsigned> HoistLoadsStoresWithCondFaultingThreshold(
     "hoist-loads-stores-with-cond-faulting-threshold", cl::Hidden, cl::init(6),
-    cl::desc("Control the maximal conditonal load/store that we are willing "
+    cl::desc("Control the maximal conditional load/store that we are willing "
              "to speculatively execute to eliminate conditional branch "
              "(default = 6)"));
 
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 02ec1d5c259c..9e815731cf8c 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -324,6 +324,11 @@ private:
       Instruction *ChainElem, Instruction *ChainBegin,
       const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
 
+  /// Merges the equivalence classes if they have underlying objects that differ
+  /// by one level of indirection (i.e., one is a getelementptr and the other is
+  /// the base pointer in that getelementptr).
+  void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const;
+
   /// Collects loads and stores grouped by "equivalence class", where:
   ///   - all elements in an eq class are a load or all are a store,
   ///   - they all load/store the same element size (it's OK to have e.g. i8 and
@@ -1305,6 +1310,119 @@ std::optional<APInt> Vectorizer::getConstantOffsetSelects(
   return std::nullopt;
 }
 
+void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
+  if (EQClasses.size() < 2) // There is nothing to merge.
+    return;
+
+  // The reduced key has all elements of the ECClassKey except the underlying
+  // object. Check that EqClassKey has 4 elements and define the reduced key.
+  static_assert(std::tuple_size_v<EqClassKey> == 4,
+                "EqClassKey has changed - EqClassReducedKey needs changes too");
+  using EqClassReducedKey =
+      std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */,
+                 std::tuple_element_t<2, EqClassKey> /* Element size */,
+                 std::tuple_element_t<3, EqClassKey> /* IsLoad; */>;
+  using ECReducedKeyToUnderlyingObjectMap =
+      MapVector<EqClassReducedKey,
+                SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>;
+
+  // Form a map from the reduced key (without the underlying object) to the
+  // underlying objects: 1 reduced key to many underlying objects, to form
+  // groups of potentially merge-able equivalence classes.
+  ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap;
+  bool FoundPotentiallyOptimizableEC = false;
+  for (const auto &EC : EQClasses) {
+    const auto &Key = EC.first;
+    EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
+                             std::get<3>(Key)};
+    RedKeyToUOMap[RedKey].insert(std::get<0>(Key));
+    if (RedKeyToUOMap[RedKey].size() > 1)
+      FoundPotentiallyOptimizableEC = true;
+  }
+  if (!FoundPotentiallyOptimizableEC)
+    return;
+
+  LLVM_DEBUG({
+    dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n";
+    for (const auto &EC : EQClasses) {
+      dbgs() << "  Key: {" << EC.first << "}\n";
+      for (const auto &Inst : EC.second)
+        dbgs() << "    Inst: " << *Inst << '\n';
+    }
+  });
+  LLVM_DEBUG({
+    dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
+    for (const auto &RedKeyToUO : RedKeyToUOMap) {
+      dbgs() << "  Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", "
+             << std::get<1>(RedKeyToUO.first) << ", "
+             << static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> "
+             << RedKeyToUO.second.size() << " underlying objects:\n";
+      for (auto UObject : RedKeyToUO.second)
+        dbgs() << "    " << *UObject << '\n';
+    }
+  });
+
+  using UObjectToUObjectMap = DenseMap<const Value *, const Value *>;
+
+  // Compute the ultimate targets for a set of underlying objects.
+  auto GetUltimateTargets =
+      [](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap {
+    UObjectToUObjectMap IndirectionMap;
+    for (const auto *UObject : UObjects) {
+      const unsigned MaxLookupDepth = 1; // look for 1-level indirections only
+      const auto *UltimateTarget = getUnderlyingObject(UObject, MaxLookupDepth);
+      if (UltimateTarget != UObject)
+        IndirectionMap[UObject] = UltimateTarget;
+    }
+    UObjectToUObjectMap UltimateTargetsMap;
+    for (const auto *UObject : UObjects) {
+      auto Target = UObject;
+      auto It = IndirectionMap.find(Target);
+      for (; It != IndirectionMap.end(); It = IndirectionMap.find(Target))
+        Target = It->second;
+      UltimateTargetsMap[UObject] = Target;
+    }
+    return UltimateTargetsMap;
+  };
+
+  // For each item in RedKeyToUOMap, if it has more than one underlying object,
+  // try to merge the equivalence classes.
+  for (auto &[RedKey, UObjects] : RedKeyToUOMap) {
+    if (UObjects.size() < 2)
+      continue;
+    auto UTMap = GetUltimateTargets(UObjects);
+    for (const auto &[UObject, UltimateTarget] : UTMap) {
+      if (UObject == UltimateTarget)
+        continue;
+
+      EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
+                         std::get<2>(RedKey)};
+      EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
+                       std::get<2>(RedKey)};
+      // The entry for KeyFrom is guarantted to exist, unlike KeyTo. Thus,
+      // request the reference to the instructions vector for KeyTo first.
+      const auto &VecTo = EQClasses[KeyTo];
+      const auto &VecFrom = EQClasses[KeyFrom];
+      SmallVector<Instruction *, 8> MergedVec;
+      std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(),
+                 std::back_inserter(MergedVec),
+                 [](Instruction *A, Instruction *B) {
+                   return A && B && A->comesBefore(B);
+                 });
+      EQClasses[KeyTo] = std::move(MergedVec);
+      EQClasses.erase(KeyFrom);
+    }
+  }
+  LLVM_DEBUG({
+    dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n";
+    for (const auto &EC : EQClasses) {
+      dbgs() << "  Key: {" << EC.first << "}\n";
+      for (const auto &Inst : EC.second)
+        dbgs() << "    Inst: " << *Inst << '\n';
+    }
+  });
+}
+
 EquivalenceClassMap
 Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
                                       BasicBlock::iterator End) {
@@ -1377,6 +1495,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
         .emplace_back(&I);
   }
 
+  mergeEquivalenceClasses(Ret);
   return Ret;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e0f629e14f65..47866dac9ad9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8354,17 +8354,22 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
     auto *GEP = dyn_cast<GetElementPtrInst>(
         Ptr->getUnderlyingValue()->stripPointerCasts());
     VPSingleDefRecipe *VectorPtr;
-    if (Reverse)
+    if (Reverse) {
+      // When folding the tail, we may compute an address that we don't in the
+      // original scalar loop and it may not be inbounds. Drop Inbounds in that
+      // case.
+      GEPNoWrapFlags Flags =
+          (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
+              ? GEPNoWrapFlags::none()
+              : GEPNoWrapFlags::inBounds();
       VectorPtr = new VPReverseVectorPointerRecipe(
-          Ptr, &Plan.getVF(), getLoadStoreType(I),
-          GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds()
-                                   : GEPNoWrapFlags::none(),
-          I->getDebugLoc());
-    else
+          Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
+    } else {
       VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
                                             GEP ? GEP->getNoWrapFlags()
                                                 : GEPNoWrapFlags::none(),
                                             I->getDebugLoc());
+    }
     Builder.getInsertBlock()->appendRecipe(VectorPtr);
     Ptr = VectorPtr;
   }
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c4582df89213..36fed8937aec 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -104,6 +104,7 @@
 using namespace llvm;
 using namespace llvm::PatternMatch;
 using namespace slpvectorizer;
+using namespace std::placeholders;
 
 #define SV_NAME "slp-vectorizer"
 #define DEBUG_TYPE "SLP"
@@ -4955,6 +4956,37 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
   return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
 }
 
+/// Correctly creates insert_subvector, checking that the index is multiple of
+/// the subvectors length. Otherwise, generates shuffle using \p Generator or
+/// using default shuffle.
+static Value *createInsertVector(
+    IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
+    function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
+  const unsigned SubVecVF = getNumElements(V->getType());
+  if (Index % SubVecVF == 0) {
+    Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
+                                     Builder.getInt64(Index));
+  } else {
+    // Create shuffle, insertvector requires that index is multiple of
+    // the subvector length.
+    const unsigned VecVF = getNumElements(Vec->getType());
+    SmallVector<int> Mask(VecVF, PoisonMaskElem);
+    std::iota(Mask.begin(), std::next(Mask.begin(), Index), 0);
+    for (unsigned I : seq<unsigned>(SubVecVF))
+      Mask[I + Index] = I + VecVF;
+    if (Generator) {
+      Vec = Generator(Vec, V, Mask);
+    } else {
+      // 1. Resize V to the size of Vec.
+      SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
+      std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
+      V = Builder.CreateShuffleVector(V, ResizeMask);
+      Vec = Builder.CreateShuffleVector(Vec, V, Mask);
+    }
+  }
+  return Vec;
+}
+
 BoUpSLP::LoadsState
 BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                            SmallVectorImpl<unsigned> &Order,
@@ -5355,11 +5387,10 @@ static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
     SmallPtrSet<Value *, 13> SecondPointers;
     Value *P1 = Ptr1;
     Value *P2 = Ptr2;
-    if (P1 == P2)
-      return false;
     unsigned Depth = 0;
-    while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1) &&
-           Depth <= RecursionMaxDepth) {
+    while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
+      if (P1 == P2 || Depth > RecursionMaxDepth)
+        return false;
       FirstPointers.insert(P1);
       SecondPointers.insert(P2);
       P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
@@ -13884,9 +13915,8 @@ Value *BoUpSLP::gather(
     Instruction *InsElt;
     if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
       assert(SLPReVec && "FixedVectorType is not expected.");
-      Vec = InsElt = Builder.CreateInsertVector(
-          Vec->getType(), Vec, Scalar,
-          Builder.getInt64(Pos * VecTy->getNumElements()));
+      Vec = InsElt = cast<Instruction>(createInsertVector(
+          Builder, Vec, Scalar, Pos * getNumElements(VecTy)));
       auto *II = dyn_cast<IntrinsicInst>(InsElt);
       if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
         return Vec;
@@ -14486,23 +14516,10 @@ public:
                                          V, SimplifyQuery(*R.DL));
                                    }));
           unsigned InsertionIndex = Idx * ScalarTyNumElements;
-          const unsigned SubVecVF =
-              cast<FixedVectorType>(V->getType())->getNumElements();
-          if (InsertionIndex % SubVecVF == 0) {
-            Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
-                                             Builder.getInt64(InsertionIndex));
-          } else {
-            // Create shuffle, insertvector requires that index is multiple of
-            // the subvectors length.
-            const unsigned VecVF =
-                cast<FixedVectorType>(Vec->getType())->getNumElements();
-            SmallVector<int> Mask(VecVF, PoisonMaskElem);
-            std::iota(Mask.begin(), Mask.end(), 0);
-            for (unsigned I : seq<unsigned>(
-                     InsertionIndex, (Idx + SubVecVF) * ScalarTyNumElements))
-              Mask[I] = I - Idx + VecVF;
-            Vec = createShuffle(Vec, V, Mask);
-          }
+          Vec = createInsertVector(
+              Builder, Vec, V, InsertionIndex,
+              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
+                        _3));
           if (!CommonMask.empty()) {
             std::iota(
                 std::next(CommonMask.begin(), InsertionIndex),
@@ -17748,7 +17765,6 @@ bool BoUpSLP::collectValuesToDemote(
     BitWidth = std::max(BitWidth, BitWidth1);
     return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
   };
-  using namespace std::placeholders;
   auto FinalAnalysis = [&]() {
     if (!IsProfitableToDemote)
       return false;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 9d7bf97d305e..cfbb4ad32d68 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1351,6 +1351,9 @@ public:
     }
   }
 
+  /// Returns true if the underlying opcode may read from or write to memory.
+  bool opcodeMayReadOrWriteFromMemory() const;
+
   /// Returns true if the recipe only uses the first lane of operand \p Op.
   bool onlyFirstLaneUsed(const VPValue *Op) const override;
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 76336ae447ed..e54df8bdeac5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -51,24 +51,7 @@ extern cl::opt<unsigned> ForceTargetInstructionCost;
 bool VPRecipeBase::mayWriteToMemory() const {
   switch (getVPDefID()) {
   case VPInstructionSC:
-    if (Instruction::isBinaryOp(cast<VPInstruction>(this)->getOpcode()))
-      return false;
-    switch (cast<VPInstruction>(this)->getOpcode()) {
-    case Instruction::Or:
-    case Instruction::ICmp:
-    case Instruction::Select:
-    case VPInstruction::AnyOf:
-    case VPInstruction::Not:
-    case VPInstruction::CalculateTripCountMinusVF:
-    case VPInstruction::CanonicalIVIncrementForPart:
-    case VPInstruction::ExtractFromEnd:
-    case VPInstruction::FirstOrderRecurrenceSplice:
-    case VPInstruction::LogicalAnd:
-    case VPInstruction::PtrAdd:
-      return false;
-    default:
-      return true;
-    }
+    return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
   case VPInterleaveSC:
     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
   case VPWidenStoreEVLSC:
@@ -115,6 +98,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
 
 bool VPRecipeBase::mayReadFromMemory() const {
   switch (getVPDefID()) {
+  case VPInstructionSC:
+    return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
   case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
     return true;
@@ -707,6 +692,26 @@ void VPInstruction::execute(VPTransformState &State) {
             /*IsScalar*/ GeneratesPerFirstLaneOnly);
 }
 
+bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
+  if (Instruction::isBinaryOp(getOpcode()))
+    return false;
+  switch (getOpcode()) {
+  case Instruction::ICmp:
+  case Instruction::Select:
+  case VPInstruction::AnyOf:
+  case VPInstruction::CalculateTripCountMinusVF:
+  case VPInstruction::CanonicalIVIncrementForPart:
+  case VPInstruction::ExtractFromEnd:
+  case VPInstruction::FirstOrderRecurrenceSplice:
+  case VPInstruction::LogicalAnd:
+  case VPInstruction::Not:
+  case VPInstruction::PtrAdd:
+    return false;
+  default:
+    return true;
+  }
+}
+
 bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
   assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
   if (Instruction::isBinaryOp(getOpcode()))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 965777002052..777944264f45 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -49,7 +49,8 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) {
     return all_of(GEP->operands(), isUniformAfterVectorization);
   if (auto *VPI = dyn_cast<VPInstruction>(Def))
     return VPI->isSingleScalar() || VPI->isVectorToScalar();
-  return false;
+  // VPExpandSCEVRecipes must be placed in the entry and are alway uniform.
+  return isa<VPExpandSCEVRecipe>(Def);
 }
 
 /// Return true if \p V is a header mask in \p Plan.
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index ea53a1acebd1..1a669b5058e7 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -705,7 +705,8 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
 
   InstructionCost NewCost =
       TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) +
-      TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, CostKind);
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, Mask,
+                         CostKind);
 
   bool NeedLenChg = SrcVecTy->getNumElements() != NumElts;
   // If the lengths of the two vectors are not equal,
@@ -2022,9 +2023,7 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
   if (Match1)
     InnerCost1 = TTI.getInstructionCost(cast<Instruction>(OuterV1), CostKind);
 
-  InstructionCost OuterCost = TTI.getShuffleCost(
-      TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy, OuterMask, CostKind,
-      0, nullptr, {OuterV0, OuterV1}, &I);
+  InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind);
 
   InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
 
diff --git a/llvm/test/Analysis/CostModel/AArch64/cast.ll b/llvm/test/Analysis/CostModel/AArch64/cast.ll
index c363b58d939e..b82c2f110d8c 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cast.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cast.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 %s | FileCheck --check-prefixes=CHECK,CHECK-NOFP16 %s
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 %s | FileCheck --check-prefixes=CHECK,CHECK-FP16 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16,+bf16 %s | FileCheck --check-prefixes=CHECK,CHECK-FP16,CHECK-BF16 %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
@@ -1237,28 +1238,51 @@ define void @fp16cast() {
 }
 
 define void @bf16cast() {
-; CHECK-LABEL: 'bf16cast'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extf16f32 = fpext bfloat undef to float
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv2f16f32 = fpext <2 x bfloat> undef to <2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv4f16f32 = fpext <4 x bfloat> undef to <4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %extv8f16f32 = fpext <8 x bfloat> undef to <8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %extv16f16f32 = fpext <16 x bfloat> undef to <16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extf16f64 = fpext bfloat undef to double
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv2f16f64 = fpext <2 x bfloat> undef to <2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %extv4f16f64 = fpext <4 x bfloat> undef to <4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %extv8f16f64 = fpext <8 x bfloat> undef to <8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %extv16f16f64 = fpext <16 x bfloat> undef to <16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %truncf16f32 = fptrunc float undef to bfloat
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %truncv2f16f32 = fptrunc <2 x float> undef to <2 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %truncv4f16f32 = fptrunc <4 x float> undef to <4 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %truncv8f16f32 = fptrunc <8 x float> undef to <8 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %truncv16f16f32 = fptrunc <16 x float> undef to <16 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %truncf16f64 = fptrunc double undef to bfloat
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %truncv2f16f64 = fptrunc <2 x double> undef to <2 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %truncv4f16f64 = fptrunc <4 x double> undef to <4 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %truncv8f16f64 = fptrunc <8 x double> undef to <8 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %truncv16f16f64 = fptrunc <16 x double> undef to <16 x bfloat>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NOFP16-LABEL: 'bf16cast'
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extf16f32 = fpext bfloat undef to float
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv2f16f32 = fpext <2 x bfloat> undef to <2 x float>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv4f16f32 = fpext <4 x bfloat> undef to <4 x float>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %extv8f16f32 = fpext <8 x bfloat> undef to <8 x float>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %extv16f16f32 = fpext <16 x bfloat> undef to <16 x float>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %extf16f64 = fpext bfloat undef to double
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %extv2f16f64 = fpext <2 x bfloat> undef to <2 x double>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %extv4f16f64 = fpext <4 x bfloat> undef to <4 x double>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %extv8f16f64 = fpext <8 x bfloat> undef to <8 x double>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %extv16f16f64 = fpext <16 x bfloat> undef to <16 x double>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %truncf16f32 = fptrunc float undef to bfloat
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %truncv2f16f32 = fptrunc <2 x float> undef to <2 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %truncv4f16f32 = fptrunc <4 x float> undef to <4 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %truncv8f16f32 = fptrunc <8 x float> undef to <8 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %truncv16f16f32 = fptrunc <16 x float> undef to <16 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %truncf16f64 = fptrunc double undef to bfloat
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %truncv2f16f64 = fptrunc <2 x double> undef to <2 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %truncv4f16f64 = fptrunc <4 x double> undef to <4 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %truncv8f16f64 = fptrunc <8 x double> undef to <8 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %truncv16f16f64 = fptrunc <16 x double> undef to <16 x bfloat>
+; CHECK-NOFP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-BF16-LABEL: 'bf16cast'
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extf16f32 = fpext bfloat undef to float
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv2f16f32 = fpext <2 x bfloat> undef to <2 x float>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %extv4f16f32 = fpext <4 x bfloat> undef to <4 x float>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %extv8f16f32 = fpext <8 x bfloat> undef to <8 x float>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %extv16f16f32 = fpext <16 x bfloat> undef to <16 x float>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %extf16f64 = fpext bfloat undef to double
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %extv2f16f64 = fpext <2 x bfloat> undef to <2 x double>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %extv4f16f64 = fpext <4 x bfloat> undef to <4 x double>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %extv8f16f64 = fpext <8 x bfloat> undef to <8 x double>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %extv16f16f64 = fpext <16 x bfloat> undef to <16 x double>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %truncf16f32 = fptrunc float undef to bfloat
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %truncv2f16f32 = fptrunc <2 x float> undef to <2 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %truncv4f16f32 = fptrunc <4 x float> undef to <4 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %truncv8f16f32 = fptrunc <8 x float> undef to <8 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %truncv16f16f32 = fptrunc <16 x float> undef to <16 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %truncf16f64 = fptrunc double undef to bfloat
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %truncv2f16f64 = fptrunc <2 x double> undef to <2 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %truncv4f16f64 = fptrunc <4 x double> undef to <4 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %truncv8f16f64 = fptrunc <8 x double> undef to <8 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %truncv16f16f64 = fptrunc <16 x double> undef to <16 x bfloat>
+; CHECK-BF16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %extf16f32 = fpext bfloat undef to float
   %extv2f16f32 = fpext <2 x bfloat> undef to <2 x float>
diff --git a/llvm/test/Analysis/CostModel/AArch64/div.ll b/llvm/test/Analysis/CostModel/AArch64/div.ll
index ada0be66c27b..ef52d0db01ee 100644
--- a/llvm/test/Analysis/CostModel/AArch64/div.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/div.ll
@@ -11,14 +11,20 @@ define i32 @sdiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = sdiv <2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = sdiv <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = sdiv <4 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = sdiv <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = sdiv <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = sdiv <8 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, undef
@@ -32,16 +38,22 @@ define i32 @sdiv() {
   %V8i64 = sdiv <8 x i64> undef, undef
 
   %I32 = sdiv i32 undef, undef
+  %V2i32 = sdiv <2 x i32> undef, undef
   %V4i32 = sdiv <4 x i32> undef, undef
   %V8i32 = sdiv <8 x i32> undef, undef
   %V16i32 = sdiv <16 x i32> undef, undef
 
   %I16 = sdiv i16 undef, undef
+  %V2i16 = sdiv <2 x i16> undef, undef
+  %V4i16 = sdiv <4 x i16> undef, undef
   %V8i16 = sdiv <8 x i16> undef, undef
   %V16i16 = sdiv <16 x i16> undef, undef
   %V32i16 = sdiv <32 x i16> undef, undef
 
   %I8 = sdiv i8 undef, undef
+  %V2i8 = sdiv <2 x i8> undef, undef
+  %V4i8 = sdiv <4 x i8> undef, undef
+  %V8i8 = sdiv <8 x i8> undef, undef
   %V16i8 = sdiv <16 x i8> undef, undef
   %V32i8 = sdiv <32 x i8> undef, undef
   %V64i8 = sdiv <64 x i8> undef, undef
@@ -57,14 +69,20 @@ define i32 @udiv() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = udiv <2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = udiv <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = udiv <4 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = udiv <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = udiv <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = udiv <8 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, undef
@@ -78,16 +96,22 @@ define i32 @udiv() {
   %V8i64 = udiv <8 x i64> undef, undef
 
   %I32 = udiv i32 undef, undef
+  %V2i32 = udiv <2 x i32> undef, undef
   %V4i32 = udiv <4 x i32> undef, undef
   %V8i32 = udiv <8 x i32> undef, undef
   %V16i32 = udiv <16 x i32> undef, undef
 
   %I16 = udiv i16 undef, undef
+  %V2i16 = udiv <2 x i16> undef, undef
+  %V4i16 = udiv <4 x i16> undef, undef
   %V8i16 = udiv <8 x i16> undef, undef
   %V16i16 = udiv <16 x i16> undef, undef
   %V32i16 = udiv <32 x i16> undef, undef
 
   %I8 = udiv i8 undef, undef
+  %V2i8 = udiv <2 x i8> undef, undef
+  %V4i8 = udiv <4 x i8> undef, undef
+  %V8i8 = udiv <8 x i8> undef, undef
   %V16i8 = udiv <16 x i8> undef, undef
   %V32i8 = udiv <32 x i8> undef, undef
   %V64i8 = udiv <64 x i8> undef, undef
@@ -103,14 +127,20 @@ define i32 @sdiv_const() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = sdiv <2 x i32> undef, <i32 4, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = sdiv <2 x i16> undef, <i16 4, i16 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = sdiv <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = sdiv <2 x i8> undef, <i8 4, i8 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = sdiv <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = sdiv <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -124,16 +154,22 @@ define i32 @sdiv_const() {
   %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 
   %I32 = sdiv i32 undef, 7
+  %V2i32 = sdiv <2 x i32> undef, <i32 4, i32 5>
   %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
   %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 
   %I16 = sdiv i16 undef, 7
+  %V2i16 = sdiv <2 x i16> undef, <i16 4, i16 5>
+  %V4i16 = sdiv <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
   %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
   %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
   %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 
   %I8 = sdiv i8 undef, 7
+  %V2i8 = sdiv <2 x i8> undef, <i8 4, i8 5>
+  %V4i8 = sdiv <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+  %V8i8 = sdiv <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
   %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -149,14 +185,20 @@ define i32 @udiv_const() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = udiv <2 x i32> undef, <i32 4, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = udiv <2 x i16> undef, <i16 4, i16 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = udiv <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = udiv <2 x i8> undef, <i8 4, i8 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = udiv <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = udiv <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -171,16 +213,22 @@ define i32 @udiv_const() {
   %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 
   %I32 = udiv i32 undef, 7
+  %V2i32 = udiv <2 x i32> undef, <i32 4, i32 5>
   %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
   %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 
   %I16 = udiv i16 undef, 7
+  %V2i16 = udiv <2 x i16> undef, <i16 4, i16 5>
+  %V4i16 = udiv <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
   %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
   %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
   %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 
   %I8 = udiv i8 undef, 7
+  %V2i8 = udiv <2 x i8> undef, <i8 4, i8 5>
+  %V4i8 = udiv <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+  %V8i8 = udiv <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
   %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -196,14 +244,20 @@ define i32 @sdiv_uniformconst() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = sdiv <4 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i64 = sdiv <8 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i32 = sdiv <2 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = sdiv <4 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = sdiv <8 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = sdiv <16 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i16 = sdiv <2 x i16> undef, splat (i16 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i16 = sdiv <4 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i16 = sdiv <8 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i16 = sdiv <16 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i16 = sdiv <32 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i8 = sdiv <2 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i8 = sdiv <4 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i8 = sdiv <8 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = sdiv <16 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = sdiv <32 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = sdiv <64 x i8> undef, splat (i8 7)
@@ -217,16 +271,22 @@ define i32 @sdiv_uniformconst() {
   %V8i64 = sdiv <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
 
   %I32 = sdiv i32 undef, 7
+  %V2i32 = sdiv <2 x i32> undef, <i32 7, i32 7>
   %V4i32 = sdiv <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
   %V8i32 = sdiv <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %V16i32 = sdiv <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 
   %I16 = sdiv i16 undef, 7
+  %V2i16 = sdiv <2 x i16> undef, <i16 7, i16 7>
+  %V4i16 = sdiv <4 x i16> undef, <i16 7, i16 7, i16 7, i16 7>
   %V8i16 = sdiv <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 
   %I8 = sdiv i8 undef, 7
+  %V2i8 = sdiv <2 x i8> undef, <i8 7, i8 7>
+  %V4i8 = sdiv <4 x i8> undef, <i8 7, i8 7, i8 7, i8 7>
+  %V8i8 = sdiv <8 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -242,14 +302,20 @@ define i32 @udiv_uniformconst() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = udiv <4 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i64 = udiv <8 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i32 = udiv <2 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = udiv <4 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = udiv <8 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = udiv <16 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i16 = udiv <2 x i16> undef, splat (i16 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i16 = udiv <4 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i16 = udiv <8 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i16 = udiv <16 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i16 = udiv <32 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i8 = udiv <2 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i8 = udiv <4 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i8 = udiv <8 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = udiv <16 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = udiv <32 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = udiv <64 x i8> undef, splat (i8 7)
@@ -263,16 +329,22 @@ define i32 @udiv_uniformconst() {
   %V8i64 = udiv <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
 
   %I32 = udiv i32 undef, 7
+  %V2i32 = udiv <2 x i32> undef, <i32 7, i32 7>
   %V4i32 = udiv <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
   %V8i32 = udiv <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %V16i32 = udiv <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 
   %I16 = udiv i16 undef, 7
+  %V2i16 = udiv <2 x i16> undef, <i16 7, i16 7>
+  %V4i16 = udiv <4 x i16> undef, <i16 7, i16 7, i16 7, i16 7>
   %V8i16 = udiv <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 
   %I8 = udiv i8 undef, 7
+  %V2i8 = udiv <2 x i8> undef, <i8 7, i8 7>
+  %V4i8 = udiv <4 x i8> undef, <i8 7, i8 7, i8 7, i8 7>
+  %V8i8 = udiv <8 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -288,14 +360,20 @@ define i32 @sdiv_constpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = sdiv i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = sdiv <2 x i32> undef, <i32 2, i32 4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = sdiv i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = sdiv <2 x i16> undef, <i16 2, i16 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = sdiv <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = sdiv i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = sdiv <2 x i8> undef, <i8 2, i8 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = sdiv <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = sdiv <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
@@ -309,16 +387,22 @@ define i32 @sdiv_constpow2() {
   %V8i64 = sdiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 
   %I32 = sdiv i32 undef, 16
+  %V2i32 = sdiv <2 x i32> undef, <i32 2, i32 4>
   %V4i32 = sdiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
   %V8i32 = sdiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
   %V16i32 = sdiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 
   %I16 = sdiv i16 undef, 16
+  %V2i16 = sdiv <2 x i16> undef, <i16 2, i16 4>
+  %V4i16 = sdiv <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
   %V8i16 = sdiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V16i16 = sdiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V32i16 = sdiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 
   %I8 = sdiv i8 undef, 16
+  %V2i8 = sdiv <2 x i8> undef, <i8 2, i8 4>
+  %V4i8 = sdiv <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+  %V8i8 = sdiv <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V16i8 = sdiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V32i8 = sdiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V64i8 = sdiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
@@ -334,14 +418,20 @@ define i32 @udiv_constpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = udiv <2 x i32> undef, <i32 2, i32 4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = udiv <2 x i16> undef, <i16 2, i16 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = udiv <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = udiv <2 x i8> undef, <i8 2, i8 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = udiv <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = udiv <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
@@ -355,16 +445,22 @@ define i32 @udiv_constpow2() {
   %V8i64 = udiv <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 
   %I32 = udiv i32 undef, 16
+  %V2i32 = udiv <2 x i32> undef, <i32 2, i32 4>
   %V4i32 = udiv <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
   %V8i32 = udiv <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
   %V16i32 = udiv <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 
   %I16 = udiv i16 undef, 16
+  %V2i16 = udiv <2 x i16> undef, <i16 2, i16 4>
+  %V4i16 = udiv <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
   %V8i16 = udiv <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V16i16 = udiv <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V32i16 = udiv <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 
   %I8 = udiv i8 undef, 16
+  %V2i8 = udiv <2 x i8> undef, <i8 2, i8 4>
+  %V4i8 = udiv <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+  %V8i8 = udiv <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V16i8 = udiv <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V32i8 = udiv <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V64i8 = udiv <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
@@ -380,14 +476,20 @@ define i32 @sdiv_uniformconstpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4i64 = sdiv <4 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8i64 = sdiv <8 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = sdiv i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2i32 = sdiv <2 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4i32 = sdiv <4 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V8i32 = sdiv <8 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V16i32 = sdiv <16 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = sdiv i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2i16 = sdiv <2 x i16> undef, splat (i16 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4i16 = sdiv <4 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8i16 = sdiv <8 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V16i16 = sdiv <16 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V32i16 = sdiv <32 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = sdiv i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2i8 = sdiv <2 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4i8 = sdiv <4 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V8i8 = sdiv <8 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V16i8 = sdiv <16 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V32i8 = sdiv <32 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V64i8 = sdiv <64 x i8> undef, splat (i8 16)
@@ -401,16 +503,22 @@ define i32 @sdiv_uniformconstpow2() {
   %V8i64 = sdiv <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 
   %I32 = sdiv i32 undef, 16
+  %V2i32 = sdiv <2 x i32> undef, <i32 16, i32 16>
   %V4i32 = sdiv <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
   %V8i32 = sdiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %V16i32 = sdiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 
   %I16 = sdiv i16 undef, 16
+  %V2i16 = sdiv <2 x i16> undef, <i16 16, i16 16>
+  %V4i16 = sdiv <4 x i16> undef, <i16 16, i16 16, i16 16, i16 16>
   %V8i16 = sdiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V16i16 = sdiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V32i16 = sdiv <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 
   %I8 = sdiv i8 undef, 16
+  %V2i8 = sdiv <2 x i8> undef, <i8 16, i8 16>
+  %V4i8 = sdiv <4 x i8> undef, <i8 16, i8 16, i8 16, i8 16>
+  %V8i8 = sdiv <8 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V16i8 = sdiv <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V32i8 = sdiv <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V64i8 = sdiv <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
@@ -426,14 +534,20 @@ define i32 @udiv_uniformconstpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = udiv <4 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i64 = udiv <8 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i32 = udiv <2 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = udiv <4 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = udiv <8 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = udiv <16 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i16 = udiv <2 x i16> undef, splat (i16 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i16 = udiv <4 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i16 = udiv <8 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i16 = udiv <16 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i16 = udiv <32 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i8 = udiv <2 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i8 = udiv <4 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i8 = udiv <8 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = udiv <16 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = udiv <32 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = udiv <64 x i8> undef, splat (i8 16)
@@ -447,16 +561,22 @@ define i32 @udiv_uniformconstpow2() {
   %V8i64 = udiv <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 
   %I32 = udiv i32 undef, 16
+  %V2i32 = udiv <2 x i32> undef, <i32 16, i32 16>
   %V4i32 = udiv <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
   %V8i32 = udiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %V16i32 = udiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 
   %I16 = udiv i16 undef, 16
+  %V2i16 = udiv <2 x i16> undef, <i16 16, i16 16>
+  %V4i16 = udiv <4 x i16> undef, <i16 16, i16 16, i16 16, i16 16>
   %V8i16 = udiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V16i16 = udiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V32i16 = udiv <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 
   %I8 = udiv i8 undef, 16
+  %V2i8 = udiv <2 x i8> undef, <i8 16, i8 16>
+  %V4i8 = udiv <4 x i8> undef, <i8 16, i8 16, i8 16, i8 16>
+  %V8i8 = udiv <8 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V16i8 = udiv <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V32i8 = udiv <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V64i8 = udiv <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
@@ -472,14 +592,20 @@ define i32 @sdiv_constnegpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = sdiv <2 x i32> undef, <i32 -2, i32 -4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = sdiv <2 x i16> undef, <i16 -2, i16 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = sdiv <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = sdiv <2 x i8> undef, <i8 -2, i8 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = sdiv <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = sdiv <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
@@ -493,16 +619,22 @@ define i32 @sdiv_constnegpow2() {
   %V8i64 = sdiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 
   %I32 = sdiv i32 undef, -16
+  %V2i32 = sdiv <2 x i32> undef, <i32 -2, i32 -4>
   %V4i32 = sdiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
   %V8i32 = sdiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
   %V16i32 = sdiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 
   %I16 = sdiv i16 undef, -16
+  %V2i16 = sdiv <2 x i16> undef, <i16 -2, i16 -4>
+  %V4i16 = sdiv <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
   %V8i16 = sdiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V16i16 = sdiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V32i16 = sdiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 
   %I8 = sdiv i8 undef, -16
+  %V2i8 = sdiv <2 x i8> undef, <i8 -2, i8 -4>
+  %V4i8 = sdiv <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+  %V8i8 = sdiv <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V16i8 = sdiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V32i8 = sdiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V64i8 = sdiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
@@ -518,14 +650,20 @@ define i32 @udiv_constnegpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i32 = udiv <2 x i32> undef, <i32 -2, i32 -4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i16 = udiv <2 x i16> undef, <i16 -2, i16 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i16 = udiv <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V2i8 = udiv <2 x i8> undef, <i8 -2, i8 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4i8 = udiv <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = udiv <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
@@ -539,16 +677,22 @@ define i32 @udiv_constnegpow2() {
   %V8i64 = udiv <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 
   %I32 = udiv i32 undef, -16
+  %V2i32 = udiv <2 x i32> undef, <i32 -2, i32 -4>
   %V4i32 = udiv <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
   %V8i32 = udiv <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
   %V16i32 = udiv <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 
   %I16 = udiv i16 undef, -16
+  %V2i16 = udiv <2 x i16> undef, <i16 -2, i16 -4>
+  %V4i16 = udiv <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
   %V8i16 = udiv <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V16i16 = udiv <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V32i16 = udiv <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 
   %I8 = udiv i8 undef, -16
+  %V2i8 = udiv <2 x i8> undef, <i8 -2, i8 -4>
+  %V4i8 = udiv <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+  %V8i8 = udiv <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V16i8 = udiv <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V32i8 = udiv <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V64i8 = udiv <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
@@ -564,14 +708,20 @@ define i32 @sdiv_uniformconstnegpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = sdiv <4 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i64 = sdiv <8 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i32 = sdiv <2 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = sdiv <4 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = sdiv <8 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = sdiv <16 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i16 = sdiv <2 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i16 = sdiv <4 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i16 = sdiv <8 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i16 = sdiv <16 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i16 = sdiv <32 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i8 = sdiv <2 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i8 = sdiv <4 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i8 = sdiv <8 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = sdiv <16 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = sdiv <32 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = sdiv <64 x i8> undef, splat (i8 -16)
@@ -585,16 +735,22 @@ define i32 @sdiv_uniformconstnegpow2() {
   %V8i64 = sdiv <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
 
   %I32 = sdiv i32 undef, -16
+  %V2i32 = sdiv <2 x i32> undef, <i32 -16, i32 -16>
   %V4i32 = sdiv <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
   %V8i32 = sdiv <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
   %V16i32 = sdiv <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
 
   %I16 = sdiv i16 undef, -16
+  %V2i16 = sdiv <2 x i16> undef, <i16 -16, i16 -16>
+  %V4i16 = sdiv <4 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16>
   %V8i16 = sdiv <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V16i16 = sdiv <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V32i16 = sdiv <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
 
   %I8 = sdiv i8 undef, -16
+  %V2i8 = sdiv <2 x i8> undef, <i8 -16, i8 -16>
+  %V4i8 = sdiv <4 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16>
+  %V8i8 = sdiv <8 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V16i8 = sdiv <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V32i8 = sdiv <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V64i8 = sdiv <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
@@ -610,14 +766,20 @@ define i32 @udiv_uniformconstnegpow2() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i64 = udiv <4 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i64 = udiv <8 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i32 = udiv <2 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = udiv <4 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = udiv <8 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = udiv <16 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i16 = udiv <2 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i16 = udiv <4 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i16 = udiv <8 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i16 = udiv <16 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i16 = udiv <32 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2i8 = udiv <2 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i8 = udiv <4 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i8 = udiv <8 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i8 = udiv <16 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = udiv <32 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = udiv <64 x i8> undef, splat (i8 -16)
@@ -631,16 +793,22 @@ define i32 @udiv_uniformconstnegpow2() {
   %V8i64 = udiv <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
 
   %I32 = udiv i32 undef, -16
+  %V2i32 = udiv <2 x i32> undef, <i32 -16, i32 -16>
   %V4i32 = udiv <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
   %V8i32 = udiv <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
   %V16i32 = udiv <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
 
   %I16 = udiv i16 undef, -16
+  %V2i16 = udiv <2 x i16> undef, <i16 -16, i16 -16>
+  %V4i16 = udiv <4 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16>
   %V8i16 = udiv <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V16i16 = udiv <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V32i16 = udiv <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
 
   %I8 = udiv i8 undef, -16
+  %V2i8 = udiv <2 x i8> undef, <i8 -16, i8 -16>
+  %V4i8 = udiv <4 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16>
+  %V8i8 = udiv <8 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V16i8 = udiv <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V32i8 = udiv <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V64i8 = udiv <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
diff --git a/llvm/test/Analysis/CostModel/AArch64/rem.ll b/llvm/test/Analysis/CostModel/AArch64/rem.ll
index 2f1e8c8bf8df..06c05aefedf2 100644
--- a/llvm/test/Analysis/CostModel/AArch64/rem.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/rem.ll
@@ -5,40 +5,55 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define i32 @srem() {
 ; CHECK-LABEL: 'srem'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, undef
+
   %I64 = srem i64 undef, undef
   %V2i64 = srem <2 x i64> undef, undef
   %V4i64 = srem <4 x i64> undef, undef
   %V8i64 = srem <8 x i64> undef, undef
 
   %I32 = srem i32 undef, undef
+  %V2i32 = srem <2 x i32> undef, undef
   %V4i32 = srem <4 x i32> undef, undef
   %V8i32 = srem <8 x i32> undef, undef
   %V16i32 = srem <16 x i32> undef, undef
 
   %I16 = srem i16 undef, undef
+  %V2i16 = srem <2 x i16> undef, undef
+  %V4i16 = srem <4 x i16> undef, undef
   %V8i16 = srem <8 x i16> undef, undef
   %V16i16 = srem <16 x i16> undef, undef
   %V32i16 = srem <32 x i16> undef, undef
 
   %I8 = srem i8 undef, undef
+  %V2i8 = srem <2 x i8> undef, undef
+  %V4i8 = srem <4 x i8> undef, undef
+  %V8i8 = srem <8 x i8> undef, undef
   %V16i8 = srem <16 x i8> undef, undef
   %V32i8 = srem <32 x i8> undef, undef
   %V64i8 = srem <64 x i8> undef, undef
@@ -48,40 +63,55 @@ define i32 @srem() {
 
 define i32 @urem() {
 ; CHECK-LABEL: 'urem'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = urem i128 undef, undef
+
   %I64 = urem i64 undef, undef
   %V2i64 = urem <2 x i64> undef, undef
   %V4i64 = urem <4 x i64> undef, undef
   %V8i64 = urem <8 x i64> undef, undef
 
   %I32 = urem i32 undef, undef
+  %V2i32 = urem <2 x i32> undef, undef
   %V4i32 = urem <4 x i32> undef, undef
   %V8i32 = urem <8 x i32> undef, undef
   %V16i32 = urem <16 x i32> undef, undef
 
   %I16 = urem i16 undef, undef
+  %V2i16 = urem <2 x i16> undef, undef
+  %V4i16 = urem <4 x i16> undef, undef
   %V8i16 = urem <8 x i16> undef, undef
   %V16i16 = urem <16 x i16> undef, undef
   %V32i16 = urem <32 x i16> undef, undef
 
   %I8 = urem i8 undef, undef
+  %V2i8 = urem <2 x i8> undef, undef
+  %V4i8 = urem <4 x i8> undef, undef
+  %V8i8 = urem <8 x i8> undef, undef
   %V16i8 = urem <16 x i8> undef, undef
   %V32i8 = urem <32 x i8> undef, undef
   %V64i8 = urem <64 x i8> undef, undef
@@ -91,40 +121,55 @@ define i32 @urem() {
 
 define i32 @srem_const() {
 ; CHECK-LABEL: 'srem_const'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, <i32 4, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, <i16 4, i16 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, <i8 4, i8 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, 7
+
   %I64 = srem i64 undef, 7
   %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
   %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
   %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 
   %I32 = srem i32 undef, 7
+  %V2i32 = srem <2 x i32> undef, <i32 4, i32 5>
   %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
   %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 
   %I16 = srem i16 undef, 7
+  %V2i16 = srem <2 x i16> undef, <i16 4, i16 5>
+  %V4i16 = srem <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
   %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
   %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
   %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 
   %I8 = srem i8 undef, 7
+  %V2i8 = srem <2 x i8> undef, <i8 4, i8 5>
+  %V4i8 = srem <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+  %V8i8 = srem <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
   %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -134,40 +179,56 @@ define i32 @srem_const() {
 
 define i32 @urem_const() {
 ; CHECK-LABEL: 'urem_const'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, <i32 4, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, <i16 4, i16 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, <i8 4, i8 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+
+  %I128 = urem i128 undef, 7
+
   %I64 = urem i64 undef, 7
   %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
   %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
   %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 
   %I32 = urem i32 undef, 7
+  %V2i32 = urem <2 x i32> undef, <i32 4, i32 5>
   %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
   %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 
   %I16 = urem i16 undef, 7
+  %V2i16 = urem <2 x i16> undef, <i16 4, i16 5>
+  %V4i16 = urem <4 x i16> undef, <i16 4, i16 5, i16 6, i16 7>
   %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
   %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
   %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 
   %I8 = urem i8 undef, 7
+  %V2i8 = urem <2 x i8> undef, <i8 4, i8 5>
+  %V4i8 = urem <4 x i8> undef, <i8 4, i8 5, i8 6, i8 7>
+  %V8i8 = urem <8 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11>
   %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
   %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
@@ -177,40 +238,55 @@ define i32 @urem_const() {
 
 define i32 @srem_uniformconst() {
 ; CHECK-LABEL: 'srem_uniformconst'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = srem <2 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = srem <4 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = srem <8 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, splat (i16 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, 7
+
   %I64 = srem i64 undef, 7
   %V2i64 = srem <2 x i64> undef, <i64 7, i64 7>
   %V4i64 = srem <4 x i64> undef, <i64 7, i64 7, i64 7, i64 7>
   %V8i64 = srem <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
 
   %I32 = srem i32 undef, 7
+  %V2i32 = srem <2 x i32> undef, <i32 7, i32 7>
   %V4i32 = srem <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
   %V8i32 = srem <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %V16i32 = srem <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 
   %I16 = srem i16 undef, 7
+  %V2i16 = srem <2 x i16> undef, <i16 7, i16 7>
+  %V4i16 = srem <4 x i16> undef, <i16 7, i16 7, i16 7, i16 7>
   %V8i16 = srem <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 
   %I8 = srem i8 undef, 7
+  %V2i8 = srem <2 x i8> undef, <i8 7, i8 7>
+  %V4i8 = srem <4 x i8> undef, <i8 7, i8 7, i8 7, i8 7>
+  %V8i8 = srem <8 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -220,40 +296,55 @@ define i32 @srem_uniformconst() {
 
 define i32 @urem_uniformconst() {
 ; CHECK-LABEL: 'urem_uniformconst'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 7
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = urem <2 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = urem <4 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = urem <8 x i64> undef, splat (i64 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, splat (i32 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, splat (i16 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, splat (i16 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 7
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, splat (i8 7)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 7)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = urem i128 undef, 7
+
   %I64 = urem i64 undef, 7
   %V2i64 = urem <2 x i64> undef, <i64 7, i64 7>
   %V4i64 = urem <4 x i64> undef, <i64 7, i64 7, i64 7, i64 7>
   %V8i64 = urem <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
 
   %I32 = urem i32 undef, 7
+  %V2i32 = urem <2 x i32> undef, <i32 7, i32 7>
   %V4i32 = urem <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
   %V8i32 = urem <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %V16i32 = urem <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 
   %I16 = urem i16 undef, 7
+  %V2i16 = urem <2 x i16> undef, <i16 7, i16 7>
+  %V4i16 = urem <4 x i16> undef, <i16 7, i16 7, i16 7, i16 7>
   %V8i16 = urem <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 
   %I8 = urem i8 undef, 7
+  %V2i8 = urem <2 x i8> undef, <i8 7, i8 7>
+  %V4i8 = urem <4 x i8> undef, <i8 7, i8 7, i8 7, i8 7>
+  %V8i8 = urem <8 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -263,40 +354,55 @@ define i32 @urem_uniformconst() {
 
 define i32 @srem_constpow2() {
 ; CHECK-LABEL: 'srem_constpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = srem i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, <i32 2, i32 4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = srem i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, <i16 2, i16 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = srem i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, <i8 2, i8 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, 16
+
   %I64 = srem i64 undef, 16
   %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
   %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
   %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 
   %I32 = srem i32 undef, 16
+  %V2i32 = srem <2 x i32> undef, <i32 2, i32 4>
   %V4i32 = srem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
   %V8i32 = srem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
   %V16i32 = srem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 
   %I16 = srem i16 undef, 16
+  %V2i16 = srem <2 x i16> undef, <i16 2, i16 4>
+  %V4i16 = srem <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
   %V8i16 = srem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 
   %I8 = srem i8 undef, 16
+  %V2i8 = srem <2 x i8> undef, <i8 2, i8 4>
+  %V4i8 = srem <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+  %V8i8 = srem <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
@@ -306,40 +412,55 @@ define i32 @srem_constpow2() {
 
 define i32 @urem_constpow2() {
 ; CHECK-LABEL: 'urem_constpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, <i64 8, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, <i32 2, i32 4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, <i16 2, i16 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, <i8 2, i8 4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = urem i128 undef, 16
+
   %I64 = urem i64 undef, 16
   %V2i64 = urem <2 x i64> undef, <i64 8, i64 16>
   %V4i64 = urem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
   %V8i64 = urem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 
   %I32 = urem i32 undef, 16
+  %V2i32 = urem <2 x i32> undef, <i32 2, i32 4>
   %V4i32 = urem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
   %V8i32 = urem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
   %V16i32 = urem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 
   %I16 = urem i16 undef, 16
+  %V2i16 = urem <2 x i16> undef, <i16 2, i16 4>
+  %V4i16 = urem <4 x i16> undef, <i16 2, i16 4, i16 8, i16 16>
   %V8i16 = urem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V16i16 = urem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
   %V32i16 = urem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 
   %I8 = urem i8 undef, 16
+  %V2i8 = urem <2 x i8> undef, <i8 2, i8 4>
+  %V4i8 = urem <4 x i8> undef, <i8 2, i8 4, i8 8, i8 16>
+  %V8i8 = urem <8 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V16i8 = urem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V32i8 = urem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
   %V64i8 = urem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
@@ -349,40 +470,55 @@ define i32 @urem_constpow2() {
 
 define i32 @srem_uniformconstpow2() {
 ; CHECK-LABEL: 'srem_uniformconstpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2i64 = srem <2 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4i64 = srem <4 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i64 = srem <8 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = srem i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2i32 = srem <2 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4i32 = srem <4 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i32 = srem <8 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i32 = srem <16 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = srem i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2i16 = srem <2 x i16> undef, splat (i16 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4i16 = srem <4 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = srem <8 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = srem <16 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = srem <32 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = srem i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2i8 = srem <2 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4i8 = srem <4 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8i8 = srem <8 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 704 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, 16
+
   %I64 = srem i64 undef, 16
   %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
   %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
   %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 
   %I32 = srem i32 undef, 16
+  %V2i32 = srem <2 x i32> undef, <i32 16, i32 16>
   %V4i32 = srem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
   %V8i32 = srem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %V16i32 = srem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 
   %I16 = srem i16 undef, 16
+  %V2i16 = srem <2 x i16> undef, <i16 16, i16 16>
+  %V4i16 = srem <4 x i16> undef, <i16 16, i16 16, i16 16, i16 16>
   %V8i16 = srem <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V16i16 = srem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V32i16 = srem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 
   %I8 = srem i8 undef, 16
+  %V2i8 = srem <2 x i8> undef, <i8 16, i8 16>
+  %V4i8 = srem <4 x i8> undef, <i8 16, i8 16, i8 16, i8 16>
+  %V8i8 = srem <8 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V16i8 = srem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V32i8 = srem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V64i8 = srem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
@@ -392,40 +528,55 @@ define i32 @srem_uniformconstpow2() {
 
 define i32 @urem_uniformconstpow2() {
 ; CHECK-LABEL: 'urem_uniformconstpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = urem <2 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = urem <4 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = urem <8 x i64> undef, splat (i64 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, splat (i32 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, splat (i16 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, splat (i16 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, splat (i8 16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = urem i128 undef, 16
+
   %I64 = urem i64 undef, 16
   %V2i64 = urem <2 x i64> undef, <i64 16, i64 16>
   %V4i64 = urem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
   %V8i64 = urem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 
   %I32 = urem i32 undef, 16
+  %V2i32 = urem <2 x i32> undef, <i32 16, i32 16>
   %V4i32 = urem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
   %V8i32 = urem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %V16i32 = urem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 
   %I16 = urem i16 undef, 16
+  %V2i16 = urem <2 x i16> undef, <i16 16, i16 16>
+  %V4i16 = urem <4 x i16> undef, <i16 16, i16 16, i16 16, i16 16>
   %V8i16 = urem <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V16i16 = urem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
   %V32i16 = urem <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 
   %I8 = urem i8 undef, 16
+  %V2i8 = urem <2 x i8> undef, <i8 16, i8 16>
+  %V4i8 = urem <4 x i8> undef, <i8 16, i8 16, i8 16, i8 16>
+  %V8i8 = urem <8 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V16i8 = urem <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V32i8 = urem <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
   %V64i8 = urem <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>
@@ -435,40 +586,55 @@ define i32 @urem_uniformconstpow2() {
 
 define i32 @srem_constnegpow2() {
 ; CHECK-LABEL: 'srem_constnegpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, <i64 -8, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, <i32 -2, i32 -4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, <i16 -2, i16 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, <i8 -2, i8 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, -16
+
   %I64 = srem i64 undef, -16
   %V2i64 = srem <2 x i64> undef, <i64 -8, i64 -16>
   %V4i64 = srem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
   %V8i64 = srem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 
   %I32 = srem i32 undef, -16
+  %V2i32 = srem <2 x i32> undef, <i32 -2, i32 -4>
   %V4i32 = srem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
   %V8i32 = srem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
   %V16i32 = srem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 
   %I16 = srem i16 undef, -16
+  %V2i16 = srem <2 x i16> undef, <i16 -2, i16 -4>
+  %V4i16 = srem <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
   %V8i16 = srem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V16i16 = srem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V32i16 = srem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 
   %I8 = srem i8 undef, -16
+  %V2i8 = srem <2 x i8> undef, <i8 -2, i8 -4>
+  %V4i8 = srem <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+  %V8i8 = srem <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V16i8 = srem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V32i8 = srem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V64i8 = srem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
@@ -478,40 +644,55 @@ define i32 @srem_constnegpow2() {
 
 define i32 @urem_constnegpow2() {
 ; CHECK-LABEL: 'urem_constnegpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, <i64 -8, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, <i32 -2, i32 -4>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, <i16 -2, i16 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, <i8 -2, i8 -4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = urem i128 undef, -16
+
   %I64 = urem i64 undef, -16
   %V2i64 = urem <2 x i64> undef, <i64 -8, i64 -16>
   %V4i64 = urem <4 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16>
   %V8i64 = urem <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 
   %I32 = urem i32 undef, -16
+  %V2i32 = urem <2 x i32> undef, <i32 -2, i32 -4>
   %V4i32 = urem <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
   %V8i32 = urem <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
   %V16i32 = urem <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 
   %I16 = urem i16 undef, -16
+  %V2i16 = urem <2 x i16> undef, <i16 -2, i16 -4>
+  %V4i16 = urem <4 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16>
   %V8i16 = urem <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V16i16 = urem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
   %V32i16 = urem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 
   %I8 = urem i8 undef, -16
+  %V2i8 = urem <2 x i8> undef, <i8 -2, i8 -4>
+  %V4i8 = urem <4 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16>
+  %V8i8 = urem <8 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V16i8 = urem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V32i8 = urem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
   %V64i8 = urem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
@@ -521,40 +702,55 @@ define i32 @urem_constnegpow2() {
 
 define i32 @srem_uniformconstnegpow2() {
 ; CHECK-LABEL: 'srem_uniformconstnegpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = srem i128 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = srem <2 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = srem <4 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = srem <8 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = srem <2 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = srem <2 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = srem <4 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = srem <2 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = srem <4 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = srem <8 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = srem i128 undef, -16
+
   %I64 = srem i64 undef, -16
   %V2i64 = srem <2 x i64> undef, <i64 -16, i64 -16>
   %V4i64 = srem <4 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16>
   %V8i64 = srem <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
 
   %I32 = srem i32 undef, -16
+  %V2i32 = srem <2 x i32> undef, <i32 -16, i32 -16>
   %V4i32 = srem <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
   %V8i32 = srem <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
   %V16i32 = srem <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
 
   %I16 = srem i16 undef, -16
+  %V2i16 = srem <2 x i16> undef, <i16 -16, i16 -16>
+  %V4i16 = srem <4 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16>
   %V8i16 = srem <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V16i16 = srem <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V32i16 = srem <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
 
   %I8 = srem i8 undef, -16
+  %V2i8 = srem <2 x i8> undef, <i8 -16, i8 -16>
+  %V4i8 = srem <4 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16>
+  %V8i8 = srem <8 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V16i8 = srem <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V32i8 = srem <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V64i8 = srem <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
@@ -564,40 +760,55 @@ define i32 @srem_uniformconstnegpow2() {
 
 define i32 @urem_uniformconstnegpow2() {
 ; CHECK-LABEL: 'urem_uniformconstnegpow2'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %I128 = urem i128 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, -16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = urem <2 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = urem <4 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = urem <8 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i32 = urem <2 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i16 = urem <2 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i16 = urem <4 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, -16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2i8 = urem <2 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4i8 = urem <4 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8i8 = urem <8 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+  %I128 = urem i128 undef, -16
+
   %I64 = urem i64 undef, -16
   %V2i64 = urem <2 x i64> undef, <i64 -16, i64 -16>
   %V4i64 = urem <4 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16>
   %V8i64 = urem <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
 
   %I32 = urem i32 undef, -16
+  %V2i32 = urem <2 x i32> undef, <i32 -16, i32 -16>
   %V4i32 = urem <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
   %V8i32 = urem <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
   %V16i32 = urem <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
 
   %I16 = urem i16 undef, -16
+  %V2i16 = urem <2 x i16> undef, <i16 -16, i16 -16>
+  %V4i16 = urem <4 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16>
   %V8i16 = urem <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V16i16 = urem <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
   %V32i16 = urem <32 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
 
   %I8 = urem i8 undef, -16
+  %V2i8 = urem <2 x i8> undef, <i8 -16, i8 -16>
+  %V4i8 = urem <4 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16>
+  %V8i8 = urem <8 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V16i8 = urem <16 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V32i8 = urem <32 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
   %V64i8 = urem <64 x i8> undef, <i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16, i8 -16>
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll
index 50356196b838..b81b6a9df1e8 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll
@@ -15,7 +15,7 @@ define void @extract_half() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_hi = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i16_lo = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_hi = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_lo = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 0, i32 1>
@@ -23,7 +23,7 @@ define void @extract_half() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_hi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -31,7 +31,7 @@ define void @extract_half() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_hi = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -114,19 +114,19 @@ define void @extract_qtr() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 4, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 2>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 4, i32 5>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 2>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 2, i32 3>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-codesize.ll b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-codesize.ll
index 027af628ea32..b672df586852 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-codesize.ll
@@ -124,7 +124,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-latency.ll
index f9f045f3a172..fc8c0cd0a399 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-latency.ll
@@ -124,7 +124,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-sizelatency.ll
index 76690afecabd..b48b6205a255 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-sizelatency.ll
@@ -124,7 +124,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll b/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll
index 034ec0acf79d..efa0f2eb8dc9 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll
@@ -124,7 +124,7 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll b/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll
index 252497643a4f..ee82e10f9ebb 100644
--- a/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll
+++ b/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll
@@ -76,58 +76,58 @@ define i32 @insert_double(i32 %arg, double %val, <2 x double> %src128, <4 x doub
 define i32 @insert_float(i32 %arg, float %val, <2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
 ; SSE2-LABEL: 'insert_float'
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'insert_float'
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'insert_float'
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
diff --git a/llvm/test/Analysis/Lint/abi-attrs.ll b/llvm/test/Analysis/Lint/abi-attrs.ll
new file mode 100644
index 000000000000..5a3ece6602f9
--- /dev/null
+++ b/llvm/test/Analysis/Lint/abi-attrs.ll
@@ -0,0 +1,106 @@
+; RUN: opt < %s -passes=lint -disable-output 2>&1 | FileCheck %s
+
+declare void @fn_nothing_i8(i8 %x)
+declare void @fn_zeroext(i8 zeroext %x)
+declare void @fn_signext(i8 signext %x)
+declare void @fn_inreg(i8 inreg %x)
+
+declare void @fn_nothing_ptr(ptr %x)
+declare void @fn_byval(ptr byval(i8) %x)
+declare void @fn_byref(ptr byref(i8) %x)
+declare void @fn_inalloca(ptr inalloca(i8) %x)
+declare void @fn_preallocated(ptr preallocated(i8) %x)
+declare void @fn_sret(ptr sret(i8) %x)
+
+define void @caller_zeroext(i8 %x) {
+; CHECK: Undefined behavior: ABI attribute zeroext not present on both function and call-site
+; CHECK:  call void @fn_zeroext(i8 %x)
+  call void @fn_zeroext(i8 %x)
+
+; CHECK: Undefined behavior: ABI attribute zeroext not present on both function and call-site
+; CHECK:  call void @fn_nothing_i8(i8 zeroext %x)
+  call void @fn_nothing_i8(i8 zeroext %x)
+  ret void
+}
+
+define void @caller_signext(i8 %x) {
+; CHECK: Undefined behavior: ABI attribute signext not present on both function and call-site
+; CHECK:  call void @fn_signext(i8 %x)
+  call void @fn_signext(i8 %x)
+
+; CHECK: Undefined behavior: ABI attribute signext not present on both function and call-site
+; CHECK:  call void @fn_nothing_i8(i8 signext %x)
+  call void @fn_nothing_i8(i8 signext %x)
+  ret void
+}
+
+define void @caller_inreg(i8 %x) {
+; CHECK: Undefined behavior: ABI attribute inreg not present on both function and call-site
+; CHECK:  call void @fn_inreg(i8 %x)
+  call void @fn_inreg(i8 %x)
+
+; CHECK: Undefined behavior: ABI attribute inreg not present on both function and call-site
+; CHECK:  call void @fn_nothing_i8(i8 inreg %x)
+  call void @fn_nothing_i8(i8 inreg %x)
+  ret void
+}
+
+define void @caller_byval(ptr %x) {
+; CHECK: Undefined behavior: ABI attribute byval not present on both function and call-site
+; CHECK:  call void @fn_byval(ptr %x)
+  call void @fn_byval(ptr %x)
+
+; CHECK: Undefined behavior: ABI attribute byval not present on both function and call-site
+; CHECK:  call void @fn_nothing_ptr(ptr byval(i8) %x)
+  call void @fn_nothing_ptr(ptr byval(i8) %x)
+
+; CHECK: Undefined behavior: ABI attribute byval does not have same argument for function and call-site
+; CHECK:  call void @fn_byval(ptr byval(i16) %x)
+  call void @fn_byval(ptr byval(i16) %x)
+  ret void
+}
+
+define void @caller_byref(ptr %x) {
+; CHECK: Undefined behavior: ABI attribute byref not present on both function and call-site
+; CHECK:  call void @fn_byref(ptr %x)
+  call void @fn_byref(ptr %x)
+
+; CHECK: Undefined behavior: ABI attribute byref not present on both function and call-site
+; CHECK:  call void @fn_nothing_ptr(ptr byref(i8) %x)
+  call void @fn_nothing_ptr(ptr byref(i8) %x)
+
+; CHECK: Undefined behavior: ABI attribute byref does not have same argument for function and call-site
+; CHECK:  call void @fn_byref(ptr byref(i16) %x)
+  call void @fn_byref(ptr byref(i16) %x)
+  ret void
+}
+
+define void @caller_inalloca(ptr %x) {
+; CHECK: Undefined behavior: ABI attribute inalloca not present on both function and call-site
+; CHECK:  call void @fn_inalloca(ptr %x)
+  call void @fn_inalloca(ptr %x)
+
+; CHECK: Undefined behavior: ABI attribute inalloca not present on both function and call-site
+; CHECK:  call void @fn_nothing_ptr(ptr inalloca(i8) %x)
+  call void @fn_nothing_ptr(ptr inalloca(i8) %x)
+
+; CHECK: Undefined behavior: ABI attribute inalloca does not have same argument for function and call-site
+; CHECK:  call void @fn_inalloca(ptr inalloca(i16) %x)
+  call void @fn_inalloca(ptr inalloca(i16) %x)
+  ret void
+}
+
+define void @caller_sret(ptr %x) {
+; CHECK: Undefined behavior: ABI attribute sret not present on both function and call-site
+; CHECK:  call void @fn_sret(ptr %x)
+  call void @fn_sret(ptr %x)
+
+; CHECK: Undefined behavior: ABI attribute sret not present on both function and call-site
+; CHECK:  call void @fn_nothing_ptr(ptr sret(i8) %x)
+  call void @fn_nothing_ptr(ptr sret(i8) %x)
+
+; CHECK: Undefined behavior: ABI attribute sret does not have same argument for function and call-site
+; CHECK:  call void @fn_sret(ptr sret(i16) %x)
+  call void @fn_sret(ptr sret(i16) %x)
+  ret void
+}
diff --git a/llvm/test/Analysis/UniformityAnalysis/NVPTX/daorder.ll b/llvm/test/Analysis/UniformityAnalysis/NVPTX/daorder.ll
index 89d8c5aa90ab..14f33d79b471 100644
--- a/llvm/test/Analysis/UniformityAnalysis/NVPTX/daorder.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/NVPTX/daorder.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
-define i32 @daorder(i32 %n) {
+define ptx_kernel i32 @daorder(i32 %n) {
 ; CHECK-LABEL: for function 'daorder'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
@@ -43,6 +43,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
 declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
-
-!nvvm.annotations = !{!0}
-!0 = !{ptr @daorder, !"kernel", i32 1}
diff --git a/llvm/test/Analysis/UniformityAnalysis/NVPTX/diverge.ll b/llvm/test/Analysis/UniformityAnalysis/NVPTX/diverge.ll
index 0ac1b5f54147..cf8ffadcd073 100644
--- a/llvm/test/Analysis/UniformityAnalysis/NVPTX/diverge.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/NVPTX/diverge.ll
@@ -4,7 +4,7 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
 ; return (n < 0 ? a + threadIdx.x : b + threadIdx.x)
-define i32 @no_diverge(i32 %n, i32 %a, i32 %b) {
+define ptx_kernel i32 @no_diverge(i32 %n, i32 %a, i32 %b) {
 ; CHECK-LABEL: for function 'no_diverge'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
@@ -27,7 +27,7 @@ merge:
 ; if (threadIdx.x < 5)    // divergent: data dependent
 ;   c = b;
 ; return c;               // c is divergent: sync dependent
-define i32 @sync(i32 %a, i32 %b) {
+define ptx_kernel i32 @sync(i32 %a, i32 %b) {
 ; CHECK-LABEL: for function 'sync'
 bb1:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
@@ -49,7 +49,7 @@ bb3:
 ; }
 ; // c here is divergent because it is sync dependent on threadIdx.x >= 5
 ; return c;
-define i32 @mixed(i32 %n, i32 %a, i32 %b) {
+define ptx_kernel i32 @mixed(i32 %n, i32 %a, i32 %b) {
 ; CHECK-LABEL: for function 'mixed'
 bb1:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
@@ -101,7 +101,7 @@ merge:
 ; return i == 10 ? 0 : 1; // i here is divergent
 ;
 ; The i defined in the loop is used outside.
-define i32 @loop() {
+define ptx_kernel i32 @loop() {
 ; CHECK-LABEL: for function 'loop'
 entry:
   %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
@@ -149,7 +149,7 @@ else:
 }
 
 ; Verifies sync-dependence is computed correctly in the absense of loops.
-define i32 @sync_no_loop(i32 %arg) {
+define ptx_kernel i32 @sync_no_loop(i32 %arg) {
 ; CHECK-LABEL: for function 'sync_no_loop'
 entry:
   %0 = add i32 %arg, 1
@@ -174,9 +174,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
 declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
 
-!nvvm.annotations = !{!0, !1, !2, !3, !4}
-!0 = !{ptr @no_diverge, !"kernel", i32 1}
-!1 = !{ptr @sync, !"kernel", i32 1}
-!2 = !{ptr @mixed, !"kernel", i32 1}
-!3 = !{ptr @loop, !"kernel", i32 1}
-!4 = !{ptr @sync_no_loop, !"kernel", i32 1}
diff --git a/llvm/test/Analysis/UniformityAnalysis/NVPTX/hidden_diverge.ll b/llvm/test/Analysis/UniformityAnalysis/NVPTX/hidden_diverge.ll
index e319211771c0..65512bf572f8 100644
--- a/llvm/test/Analysis/UniformityAnalysis/NVPTX/hidden_diverge.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/NVPTX/hidden_diverge.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
-define i32 @hidden_diverge(i32 %n, i32 %a, i32 %b) {
+define ptx_kernel i32 @hidden_diverge(i32 %n, i32 %a, i32 %b) {
 ; CHECK-LABEL: for function 'hidden_diverge'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
@@ -27,6 +27,3 @@ merge:
 }
 
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-
-!nvvm.annotations = !{!0}
-!0 = !{ptr @hidden_diverge, !"kernel", i32 1}
diff --git a/llvm/test/Analysis/UniformityAnalysis/NVPTX/irreducible.ll b/llvm/test/Analysis/UniformityAnalysis/NVPTX/irreducible.ll
index cd729a918f81..e1ecc69871b9 100644
--- a/llvm/test/Analysis/UniformityAnalysis/NVPTX/irreducible.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/NVPTX/irreducible.ll
@@ -23,7 +23,7 @@ target triple = "nvptx64-nvidia-cuda"
 ;                             V
 ;                        if (i3 == 5) // divergent
 ; because sync dependent on (tid / i3).
-define i32 @unstructured_loop(i1 %entry_cond) {
+define ptx_kernel i32 @unstructured_loop(i1 %entry_cond) {
 ; CHECK-LABEL: for function 'unstructured_loop'
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
@@ -59,5 +59,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
 declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
 
-!nvvm.annotations = !{!0}
-!0 = !{ptr @unstructured_loop, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll
index 1aa28f5c2733..9a1203f18243 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll
@@ -156,11 +156,10 @@ define i32 @fptosi_bf(bfloat %a) nounwind ssp {
 ; CHECK-LABEL: fptosi_bf:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmov s1, s0
-; CHECK-NEXT:    // implicit-def: $s0
+; CHECK-NEXT:    // implicit-def: $d0
 ; CHECK-NEXT:    fmov s0, s1
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    fcvtzs w0, s0
 ; CHECK-NEXT:    ret
 entry:
@@ -173,11 +172,10 @@ define i32 @fptoui_sbf(bfloat %a) nounwind ssp {
 ; CHECK-LABEL: fptoui_sbf:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmov s1, s0
-; CHECK-NEXT:    // implicit-def: $s0
+; CHECK-NEXT:    // implicit-def: $d0
 ; CHECK-NEXT:    fmov s0, s1
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    fcvtzu w0, s0
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
index ed9c1b037d0c..fb40dfcbe101 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
@@ -182,17 +182,14 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
 define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fadd s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -202,36 +199,34 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB2_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fadd s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB2_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2:
@@ -281,17 +276,14 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fadd s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -301,36 +293,34 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB3_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fadd s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB3_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4:
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
index 888b795876f7..818dcf3a0b48 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
@@ -184,17 +184,14 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
 define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fmaxnm s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -204,36 +201,34 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB2_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fmaxnm s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB2_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2:
@@ -283,17 +278,14 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fmaxnm s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -303,36 +295,34 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB3_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fmaxnm s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB3_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4:
@@ -653,31 +643,23 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; NOLSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
 ; NOLSE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NOLSE-NEXT:    mov h1, v0.h[1]
-; NOLSE-NEXT:    fmov w10, s0
+; NOLSE-NEXT:    dup v1.4h, v0.h[1]
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w10, w10, #16
-; NOLSE-NEXT:    fmov w9, s1
-; NOLSE-NEXT:    fmov s1, w10
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s0, w9
+; NOLSE-NEXT:    shll v0.4s, v0.4h, #16
+; NOLSE-NEXT:    shll v1.4s, v1.4h, #16
 ; NOLSE-NEXT:  .LBB7_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxr w9, [x0]
 ; NOLSE-NEXT:    fmov s2, w9
-; NOLSE-NEXT:    mov h3, v2.h[1]
-; NOLSE-NEXT:    fmov w11, s2
-; NOLSE-NEXT:    lsl w11, w11, #16
-; NOLSE-NEXT:    fmov w10, s3
-; NOLSE-NEXT:    fmov s3, w11
-; NOLSE-NEXT:    lsl w10, w10, #16
-; NOLSE-NEXT:    fmaxnm s3, s3, s1
-; NOLSE-NEXT:    fmov s2, w10
+; NOLSE-NEXT:    dup v3.4h, v2.h[1]
+; NOLSE-NEXT:    shll v2.4s, v2.4h, #16
 ; NOLSE-NEXT:    fmaxnm s2, s2, s0
-; NOLSE-NEXT:    fmov w11, s3
+; NOLSE-NEXT:    shll v3.4s, v3.4h, #16
+; NOLSE-NEXT:    fmaxnm s3, s3, s1
+; NOLSE-NEXT:    fmov w11, s2
 ; NOLSE-NEXT:    ubfx w13, w11, #16, #1
 ; NOLSE-NEXT:    add w11, w11, w8
-; NOLSE-NEXT:    fmov w10, s2
+; NOLSE-NEXT:    fmov w10, s3
 ; NOLSE-NEXT:    add w11, w13, w11
 ; NOLSE-NEXT:    lsr w11, w11, #16
 ; NOLSE-NEXT:    ubfx w12, w10, #16, #1
@@ -697,25 +679,17 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; LSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
 ; LSE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; LSE-NEXT:    mov h1, v0.h[1]
-; LSE-NEXT:    fmov w10, s0
+; LSE-NEXT:    dup v1.4h, v0.h[1]
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr s0, [x0]
-; LSE-NEXT:    lsl w10, w10, #16
-; LSE-NEXT:    fmov w9, s1
-; LSE-NEXT:    fmov s2, w10
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
+; LSE-NEXT:    shll v1.4s, v1.4h, #16
 ; LSE-NEXT:  .LBB7_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    mov h3, v0.h[1]
-; LSE-NEXT:    fmov w10, s0
-; LSE-NEXT:    lsl w10, w10, #16
-; LSE-NEXT:    fmov w9, s3
-; LSE-NEXT:    fmov s4, w10
-; LSE-NEXT:    lsl w9, w9, #16
+; LSE-NEXT:    dup v3.4h, v0.h[1]
+; LSE-NEXT:    shll v4.4s, v0.4h, #16
 ; LSE-NEXT:    fmaxnm s4, s4, s2
-; LSE-NEXT:    fmov s3, w9
+; LSE-NEXT:    shll v3.4s, v3.4h, #16
 ; LSE-NEXT:    fmaxnm s3, s3, s1
 ; LSE-NEXT:    fmov w10, s4
 ; LSE-NEXT:    ubfx w12, w10, #16, #1
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
index a3665c6e4286..b969241e8bf9 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
@@ -184,17 +184,14 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
 define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fminnm s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -204,36 +201,34 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB2_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fminnm s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB2_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2:
@@ -283,17 +278,14 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fminnm s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -303,36 +295,34 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB3_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fminnm s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB3_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4:
@@ -653,31 +643,23 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; NOLSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
 ; NOLSE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NOLSE-NEXT:    mov h1, v0.h[1]
-; NOLSE-NEXT:    fmov w10, s0
+; NOLSE-NEXT:    dup v1.4h, v0.h[1]
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w10, w10, #16
-; NOLSE-NEXT:    fmov w9, s1
-; NOLSE-NEXT:    fmov s1, w10
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s0, w9
+; NOLSE-NEXT:    shll v0.4s, v0.4h, #16
+; NOLSE-NEXT:    shll v1.4s, v1.4h, #16
 ; NOLSE-NEXT:  .LBB7_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxr w9, [x0]
 ; NOLSE-NEXT:    fmov s2, w9
-; NOLSE-NEXT:    mov h3, v2.h[1]
-; NOLSE-NEXT:    fmov w11, s2
-; NOLSE-NEXT:    lsl w11, w11, #16
-; NOLSE-NEXT:    fmov w10, s3
-; NOLSE-NEXT:    fmov s3, w11
-; NOLSE-NEXT:    lsl w10, w10, #16
-; NOLSE-NEXT:    fminnm s3, s3, s1
-; NOLSE-NEXT:    fmov s2, w10
+; NOLSE-NEXT:    dup v3.4h, v2.h[1]
+; NOLSE-NEXT:    shll v2.4s, v2.4h, #16
 ; NOLSE-NEXT:    fminnm s2, s2, s0
-; NOLSE-NEXT:    fmov w11, s3
+; NOLSE-NEXT:    shll v3.4s, v3.4h, #16
+; NOLSE-NEXT:    fminnm s3, s3, s1
+; NOLSE-NEXT:    fmov w11, s2
 ; NOLSE-NEXT:    ubfx w13, w11, #16, #1
 ; NOLSE-NEXT:    add w11, w11, w8
-; NOLSE-NEXT:    fmov w10, s2
+; NOLSE-NEXT:    fmov w10, s3
 ; NOLSE-NEXT:    add w11, w13, w11
 ; NOLSE-NEXT:    lsr w11, w11, #16
 ; NOLSE-NEXT:    ubfx w12, w10, #16, #1
@@ -697,25 +679,17 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; LSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
 ; LSE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; LSE-NEXT:    mov h1, v0.h[1]
-; LSE-NEXT:    fmov w10, s0
+; LSE-NEXT:    dup v1.4h, v0.h[1]
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr s0, [x0]
-; LSE-NEXT:    lsl w10, w10, #16
-; LSE-NEXT:    fmov w9, s1
-; LSE-NEXT:    fmov s2, w10
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
+; LSE-NEXT:    shll v1.4s, v1.4h, #16
 ; LSE-NEXT:  .LBB7_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    mov h3, v0.h[1]
-; LSE-NEXT:    fmov w10, s0
-; LSE-NEXT:    lsl w10, w10, #16
-; LSE-NEXT:    fmov w9, s3
-; LSE-NEXT:    fmov s4, w10
-; LSE-NEXT:    lsl w9, w9, #16
+; LSE-NEXT:    dup v3.4h, v0.h[1]
+; LSE-NEXT:    shll v4.4s, v0.4h, #16
 ; LSE-NEXT:    fminnm s4, s4, s2
-; LSE-NEXT:    fmov s3, w9
+; LSE-NEXT:    shll v3.4s, v3.4h, #16
 ; LSE-NEXT:    fminnm s3, s3, s1
 ; LSE-NEXT:    fmov w10, s4
 ; LSE-NEXT:    ubfx w12, w10, #16, #1
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll
index 7725ce0e7318..e603337e7a56 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll
@@ -182,17 +182,14 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
 define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fsub s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -202,36 +199,34 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB2_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB2_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fsub s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB2_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2:
@@ -281,17 +276,14 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; NOLSE-NEXT:    shll v1.4s, v0.4h, #16
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s1, w9
 ; NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; NOLSE-NEXT:    ldaxrh w9, [x0]
 ; NOLSE-NEXT:    fmov s0, w9
-; NOLSE-NEXT:    lsl w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
+; NOLSE-NEXT:    shll v2.4s, v0.4h, #16
 ; NOLSE-NEXT:    fsub s2, s2, s1
 ; NOLSE-NEXT:    fmov w9, s2
 ; NOLSE-NEXT:    ubfx w10, w9, #16, #1
@@ -301,36 +293,34 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    stlxrh w10, w9, [x0]
 ; NOLSE-NEXT:    cbnz w10, .LBB3_1
 ; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4:
 ; LSE:       // %bb.0:
-; LSE-NEXT:    // kill: def $h0 killed $h0 def $s0
-; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 def $d0
+; LSE-NEXT:    shll v1.4s, v0.4h, #16
 ; LSE-NEXT:    mov w8, #32767 // =0x7fff
 ; LSE-NEXT:    ldr h0, [x0]
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s1, w9
 ; LSE-NEXT:  .LBB3_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    fmov w9, s0
-; LSE-NEXT:    lsl w9, w9, #16
-; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    shll v2.4s, v0.4h, #16
 ; LSE-NEXT:    fsub s2, s2, s1
 ; LSE-NEXT:    fmov w9, s2
 ; LSE-NEXT:    ubfx w10, w9, #16, #1
 ; LSE-NEXT:    add w9, w9, w8
 ; LSE-NEXT:    add w9, w10, w9
-; LSE-NEXT:    fmov w10, s0
 ; LSE-NEXT:    lsr w9, w9, #16
-; LSE-NEXT:    mov w11, w10
-; LSE-NEXT:    casalh w11, w9, [x0]
+; LSE-NEXT:    fmov s2, w9
+; LSE-NEXT:    fmov w9, s0
+; LSE-NEXT:    fmov w10, s2
+; LSE-NEXT:    mov w11, w9
+; LSE-NEXT:    casalh w11, w10, [x0]
 ; LSE-NEXT:    fmov s0, w11
-; LSE-NEXT:    cmp w11, w10, uxth
+; LSE-NEXT:    cmp w11, w9, uxth
 ; LSE-NEXT:    b.ne .LBB3_1
 ; LSE-NEXT:  // %bb.2: // %atomicrmw.end
-; LSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
+; LSE-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; LSE-NEXT:    ret
 ;
 ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4:
diff --git a/llvm/test/CodeGen/AArch64/bf16-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
index 33997614598c..bc06453e9c01 100644
--- a/llvm/test/CodeGen/AArch64/bf16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
@@ -5,16 +5,12 @@
 define bfloat @test_fadd(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_fadd:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    fadd s0, s1, s0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fadd s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
@@ -26,15 +22,11 @@ define bfloat @test_fadd(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fadd:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fadd s0, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fadd s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = fadd bfloat %a, %b
@@ -44,16 +36,12 @@ define bfloat @test_fadd(bfloat %a, bfloat %b) #0 {
 define bfloat @test_fsub(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_fsub:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    fsub s0, s1, s0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fsub s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
@@ -65,15 +53,11 @@ define bfloat @test_fsub(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fsub:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fsub s0, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fsub s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = fsub bfloat %a, %b
@@ -83,16 +67,12 @@ define bfloat @test_fsub(bfloat %a, bfloat %b) #0 {
 define bfloat @test_fmul(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_fmul:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    fmul s0, s1, s0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fmul s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
@@ -104,15 +84,11 @@ define bfloat @test_fmul(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fmul:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fmul s0, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmul s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = fmul bfloat %a, %b
@@ -122,27 +98,21 @@ define bfloat @test_fmul(bfloat %a, bfloat %b) #0 {
 define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-CVT-LABEL: test_fmadd:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s1
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w10, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    fmul s0, s1, s0
+; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fmul s0, s0, s1
+; CHECK-CVT-NEXT:    shll v1.4s, v2.4h, #16
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w8, w10
 ; CHECK-CVT-NEXT:    add w8, w9, w8
-; CHECK-CVT-NEXT:    fmov w9, s2
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
 ; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    fadd s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
@@ -155,23 +125,15 @@ define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fmadd:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fmov w9, s2
-; CHECK-BF16-NEXT:    fmul s0, s1, s0
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmul s0, s0, s1
+; CHECK-BF16-NEXT:    shll v1.4s, v2.4h, #16
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    fadd s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -183,16 +145,12 @@ define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 {
 define bfloat @test_fdiv(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_fdiv:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    fdiv s0, s1, s0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fdiv s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
@@ -204,15 +162,11 @@ define bfloat @test_fdiv(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fdiv:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fdiv s0, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fdiv s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = fdiv bfloat %a, %b
@@ -223,14 +177,12 @@ define bfloat @test_frem(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_frem:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-CVT-NEXT:    // kill: def $s1 killed $s1 killed $q1
 ; CHECK-CVT-NEXT:    bl fmodf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -246,14 +198,12 @@ define bfloat @test_frem(bfloat %a, bfloat %b) #0 {
 ; CHECK-BF16-LABEL: test_frem:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    fmov w9, s1
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    // kill: def $s1 killed $s1 killed $q1
 ; CHECK-BF16-NEXT:    bl fmodf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -334,17 +284,13 @@ define bfloat @test_select(bfloat %a, bfloat %b, i1 zeroext %c) #0 {
 define bfloat @test_select_cc(bfloat %a, bfloat %b, bfloat %c, bfloat %d) #0 {
 ; CHECK-LABEL: test_select_cc:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h3 killed $h3 def $s3
-; CHECK-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    // kill: def $h3 killed $h3 def $d3
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $d2
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
 ; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    fcmp s2, s3
 ; CHECK-NEXT:    fcsel s0, s0, s1, ne
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; CHECK-NEXT:    ret
@@ -356,15 +302,11 @@ define bfloat @test_select_cc(bfloat %a, bfloat %b, bfloat %c, bfloat %d) #0 {
 define float @test_select_cc_f32_f16(float %a, float %b, bfloat %c, bfloat %d) #0 {
 ; CHECK-LABEL: test_select_cc_f32_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h3 killed $h3 def $s3
-; CHECK-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    // kill: def $h3 killed $h3 def $d3
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    fcmp s2, s3
 ; CHECK-NEXT:    fcsel s0, s0, s1, ne
 ; CHECK-NEXT:    ret
   %cc = fcmp une bfloat %c, %d
@@ -389,15 +331,11 @@ define bfloat @test_select_cc_f16_f32(bfloat %a, bfloat %b, float %c, float %d)
 define i1 @test_fcmp_une(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_une:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %r = fcmp une bfloat %a, %b
@@ -407,15 +345,11 @@ define i1 @test_fcmp_une(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ueq(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ueq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w8, eq
 ; CHECK-NEXT:    csinc w0, w8, wzr, vc
 ; CHECK-NEXT:    ret
@@ -426,15 +360,11 @@ define i1 @test_fcmp_ueq(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ugt(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ugt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, hi
 ; CHECK-NEXT:    ret
   %r = fcmp ugt bfloat %a, %b
@@ -444,15 +374,11 @@ define i1 @test_fcmp_ugt(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_uge(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_uge:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, pl
 ; CHECK-NEXT:    ret
   %r = fcmp uge bfloat %a, %b
@@ -462,15 +388,11 @@ define i1 @test_fcmp_uge(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ult(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ult:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, lt
 ; CHECK-NEXT:    ret
   %r = fcmp ult bfloat %a, %b
@@ -480,15 +402,11 @@ define i1 @test_fcmp_ult(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ule(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ule:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, le
 ; CHECK-NEXT:    ret
   %r = fcmp ule bfloat %a, %b
@@ -498,15 +416,11 @@ define i1 @test_fcmp_ule(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_uno(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_uno:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, vs
 ; CHECK-NEXT:    ret
   %r = fcmp uno bfloat %a, %b
@@ -516,15 +430,11 @@ define i1 @test_fcmp_uno(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_one(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w8, mi
 ; CHECK-NEXT:    csinc w0, w8, wzr, le
 ; CHECK-NEXT:    ret
@@ -535,15 +445,11 @@ define i1 @test_fcmp_one(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_oeq(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_oeq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %r = fcmp oeq bfloat %a, %b
@@ -553,15 +459,11 @@ define i1 @test_fcmp_oeq(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ogt(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ogt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
   %r = fcmp ogt bfloat %a, %b
@@ -571,15 +473,11 @@ define i1 @test_fcmp_ogt(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_oge(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_oge:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, ge
 ; CHECK-NEXT:    ret
   %r = fcmp oge bfloat %a, %b
@@ -589,15 +487,11 @@ define i1 @test_fcmp_oge(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_olt(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_olt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, mi
 ; CHECK-NEXT:    ret
   %r = fcmp olt bfloat %a, %b
@@ -607,15 +501,11 @@ define i1 @test_fcmp_olt(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ole(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ole:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, ls
 ; CHECK-NEXT:    ret
   %r = fcmp ole bfloat %a, %b
@@ -625,15 +515,11 @@ define i1 @test_fcmp_ole(bfloat %a, bfloat %b) #0 {
 define i1 @test_fcmp_ord(bfloat %a, bfloat %b) #0 {
 ; CHECK-LABEL: test_fcmp_ord:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, vc
 ; CHECK-NEXT:    ret
   %r = fcmp ord bfloat %a, %b
@@ -643,13 +529,11 @@ define i1 @test_fcmp_ord(bfloat %a, bfloat %b) #0 {
 define void @test_fccmp(bfloat %in, ptr %out) {
 ; CHECK-LABEL: test_fccmp:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    movi v1.2s, #69, lsl #24
-; CHECK-NEXT:    movi v3.2s, #72, lsl #24
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v2.4s, v0.4h, #16
 ; CHECK-NEXT:    adrp x8, .LCPI29_0
+; CHECK-NEXT:    movi v3.2s, #72, lsl #24
 ; CHECK-NEXT:    fcmp s2, s1
 ; CHECK-NEXT:    ldr h1, [x8, :lo12:.LCPI29_0]
 ; CHECK-NEXT:    fccmp s2, s3, #4, mi
@@ -667,15 +551,11 @@ define void @test_fccmp(bfloat %in, ptr %out) {
 define void @test_br_cc(bfloat %a, bfloat %b, ptr %p1, ptr %p2) #0 {
 ; CHECK-LABEL: test_br_cc:
 ; CHECK:       // %bb.0: // %common.ret
-; CHECK-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    csel x8, x0, x1, pl
 ; CHECK-NEXT:    str wzr, [x8]
 ; CHECK-NEXT:    ret
@@ -725,10 +605,8 @@ declare i1 @test_dummy(ptr %p1) #0
 define i32 @test_fptosi_i32(bfloat %a) #0 {
 ; CHECK-LABEL: test_fptosi_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs w0, s0
 ; CHECK-NEXT:    ret
   %r = fptosi bfloat %a to i32
@@ -738,10 +616,8 @@ define i32 @test_fptosi_i32(bfloat %a) #0 {
 define i64 @test_fptosi_i64(bfloat %a) #0 {
 ; CHECK-LABEL: test_fptosi_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs x0, s0
 ; CHECK-NEXT:    ret
   %r = fptosi bfloat %a to i64
@@ -751,10 +627,8 @@ define i64 @test_fptosi_i64(bfloat %a) #0 {
 define i32 @test_fptoui_i32(bfloat %a) #0 {
 ; CHECK-LABEL: test_fptoui_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzu w0, s0
 ; CHECK-NEXT:    ret
   %r = fptoui bfloat %a to i32
@@ -764,10 +638,8 @@ define i32 @test_fptoui_i32(bfloat %a) #0 {
 define i64 @test_fptoui_i64(bfloat %a) #0 {
 ; CHECK-LABEL: test_fptoui_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzu x0, s0
 ; CHECK-NEXT:    ret
   %r = fptoui bfloat %a to i64
@@ -927,7 +799,8 @@ define bfloat @test_uitofp_i32_fadd(i32 %a, bfloat %b) #0 {
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    ucvtf d1, w0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    fcvtxn s1, d1
 ; CHECK-CVT-NEXT:    fmov w9, s1
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -935,12 +808,7 @@ define bfloat @test_uitofp_i32_fadd(i32 %a, bfloat %b) #0 {
 ; CHECK-CVT-NEXT:    add w9, w10, w9
 ; CHECK-CVT-NEXT:    lsr w9, w9, #16
 ; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    fmov w9, s0
-; CHECK-CVT-NEXT:    fmov w10, s1
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s1, w10
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-CVT-NEXT:    fadd s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -954,15 +822,11 @@ define bfloat @test_uitofp_i32_fadd(i32 %a, bfloat %b) #0 {
 ; CHECK-BF16-LABEL: test_uitofp_i32_fadd:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    ucvtf d1, w0
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    fcvtxn s1, d1
-; CHECK-BF16-NEXT:    fmov s0, w8
 ; CHECK-BF16-NEXT:    bfcvt h1, s1
-; CHECK-BF16-NEXT:    fmov w9, s1
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-BF16-NEXT:    fadd s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -976,7 +840,8 @@ define bfloat @test_sitofp_i32_fadd(i32 %a, bfloat %b) #0 {
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    scvtf d1, w0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    fcvtxn s1, d1
 ; CHECK-CVT-NEXT:    fmov w9, s1
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -984,12 +849,7 @@ define bfloat @test_sitofp_i32_fadd(i32 %a, bfloat %b) #0 {
 ; CHECK-CVT-NEXT:    add w9, w10, w9
 ; CHECK-CVT-NEXT:    lsr w9, w9, #16
 ; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    fmov w9, s0
-; CHECK-CVT-NEXT:    fmov w10, s1
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s1, w10
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-CVT-NEXT:    fadd s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -1003,15 +863,11 @@ define bfloat @test_sitofp_i32_fadd(i32 %a, bfloat %b) #0 {
 ; CHECK-BF16-LABEL: test_sitofp_i32_fadd:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    scvtf d1, w0
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    fcvtxn s1, d1
-; CHECK-BF16-NEXT:    fmov s0, w8
 ; CHECK-BF16-NEXT:    bfcvt h1, s1
-; CHECK-BF16-NEXT:    fmov w9, s1
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-BF16-NEXT:    fadd s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -1070,10 +926,9 @@ define bfloat @test_fptrunc_double(double %a) #0 {
 define float @test_fpext_float(bfloat %a) #0 {
 ; CHECK-LABEL: test_fpext_float:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    ret
   %r = fpext bfloat %a to float
   ret float %r
@@ -1082,10 +937,8 @@ define float @test_fpext_float(bfloat %a) #0 {
 define double @test_fpext_double(bfloat %a) #0 {
 ; CHECK-LABEL: test_fpext_double:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvt d0, s0
 ; CHECK-NEXT:    ret
   %r = fpext bfloat %a to double
@@ -1148,11 +1001,9 @@ declare bfloat @llvm.fmuladd.f16(bfloat %a, bfloat %b, bfloat %c) #0
 define bfloat @test_sqrt(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_sqrt:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    fsqrt s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -1165,10 +1016,8 @@ define bfloat @test_sqrt(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_sqrt:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    fsqrt s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -1180,10 +1029,9 @@ define bfloat @test_powi(bfloat %a, i32 %b) #0 {
 ; CHECK-CVT-LABEL: test_powi:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl __powisf2
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1199,10 +1047,9 @@ define bfloat @test_powi(bfloat %a, i32 %b) #0 {
 ; CHECK-BF16-LABEL: test_powi:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl __powisf2
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1216,10 +1063,9 @@ define bfloat @test_sin(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_sin:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl sinf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1235,10 +1081,9 @@ define bfloat @test_sin(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_sin:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl sinf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1251,10 +1096,9 @@ define bfloat @test_cos(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_cos:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl cosf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1270,10 +1114,9 @@ define bfloat @test_cos(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_cos:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl cosf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1286,10 +1129,9 @@ define bfloat @test_tan(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_tan:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl tanf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1305,10 +1147,9 @@ define bfloat @test_tan(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_tan:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl tanf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1321,10 +1162,9 @@ define bfloat @test_acos(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_acos:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl acosf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1340,10 +1180,9 @@ define bfloat @test_acos(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_acos:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl acosf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1356,10 +1195,9 @@ define bfloat @test_asin(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_asin:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl asinf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1375,10 +1213,9 @@ define bfloat @test_asin(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_asin:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl asinf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1391,10 +1228,9 @@ define bfloat @test_atan(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_atan:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl atanf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1410,10 +1246,9 @@ define bfloat @test_atan(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_atan:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl atanf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1426,14 +1261,12 @@ define bfloat @test_atan2(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_atan2:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-CVT-NEXT:    // kill: def $s1 killed $s1 killed $q1
 ; CHECK-CVT-NEXT:    bl atan2f
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1449,14 +1282,12 @@ define bfloat @test_atan2(bfloat %a, bfloat %b) #0 {
 ; CHECK-BF16-LABEL: test_atan2:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    fmov w9, s1
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    // kill: def $s1 killed $s1 killed $q1
 ; CHECK-BF16-NEXT:    bl atan2f
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1469,10 +1300,9 @@ define bfloat @test_cosh(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_cosh:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl coshf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1488,10 +1318,9 @@ define bfloat @test_cosh(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_cosh:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl coshf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1504,10 +1333,9 @@ define bfloat @test_sinh(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_sinh:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl sinhf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1523,10 +1351,9 @@ define bfloat @test_sinh(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_sinh:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl sinhf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1539,10 +1366,9 @@ define bfloat @test_tanh(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_tanh:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl tanhf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1558,10 +1384,9 @@ define bfloat @test_tanh(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_tanh:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl tanhf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1574,14 +1399,12 @@ define bfloat @test_pow(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_pow:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-CVT-NEXT:    // kill: def $s1 killed $s1 killed $q1
 ; CHECK-CVT-NEXT:    bl powf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1597,14 +1420,12 @@ define bfloat @test_pow(bfloat %a, bfloat %b) #0 {
 ; CHECK-BF16-LABEL: test_pow:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    fmov w9, s1
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT:    // kill: def $s1 killed $s1 killed $q1
 ; CHECK-BF16-NEXT:    bl powf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1617,10 +1438,9 @@ define bfloat @test_exp(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_exp:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl expf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1636,10 +1456,9 @@ define bfloat @test_exp(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_exp:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl expf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1652,10 +1471,9 @@ define bfloat @test_exp2(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_exp2:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl exp2f
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1671,10 +1489,9 @@ define bfloat @test_exp2(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_exp2:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl exp2f
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1687,10 +1504,9 @@ define bfloat @test_log(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_log:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl logf
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1706,10 +1522,9 @@ define bfloat @test_log(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_log:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl logf
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1722,10 +1537,9 @@ define bfloat @test_log10(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_log10:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl log10f
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1741,10 +1555,9 @@ define bfloat @test_log10(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_log10:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl log10f
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1757,10 +1570,9 @@ define bfloat @test_log2(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_log2:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    bl log2f
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
@@ -1776,10 +1588,9 @@ define bfloat @test_log2(bfloat %a) #0 {
 ; CHECK-BF16-LABEL: test_log2:
 ; CHECK-BF16:       // %bb.0:
 ; CHECK-BF16-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    bl log2f
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1791,20 +1602,14 @@ define bfloat @test_log2(bfloat %a) #0 {
 define bfloat @test_fma(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-CVT-LABEL: test_fma:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s2
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    fmov s2, w10
+; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w10, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    fmadd s0, s2, s1, s0
+; CHECK-CVT-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fmadd s0, s0, s1, s2
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w8, w10
@@ -1816,19 +1621,13 @@ define bfloat @test_fma(bfloat %a, bfloat %b, bfloat %c) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fma:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s2
-; CHECK-BF16-NEXT:    fmov w9, s1
-; CHECK-BF16-NEXT:    fmov w10, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    lsl w10, w10, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fmov s2, w10
-; CHECK-BF16-NEXT:    fmadd s0, s2, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmadd s0, s0, s1, s2
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.fma.f16(bfloat %a, bfloat %b, bfloat %c)
@@ -1851,16 +1650,12 @@ define bfloat @test_fabs(bfloat %a) #0 {
 define bfloat @test_minnum(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_minnum:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    fminnm s0, s1, s0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fminnm s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
@@ -1872,15 +1667,11 @@ define bfloat @test_minnum(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_minnum:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fminnm s0, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fminnm s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.minnum.f16(bfloat %a, bfloat %b)
@@ -1890,16 +1681,12 @@ define bfloat @test_minnum(bfloat %a, bfloat %b) #0 {
 define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_maxnum:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s1
-; CHECK-CVT-NEXT:    fmov w10, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    lsl w10, w10, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
-; CHECK-CVT-NEXT:    fmov s1, w10
-; CHECK-CVT-NEXT:    fmaxnm s0, s1, s0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fmaxnm s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w9, w8
@@ -1911,15 +1698,11 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_maxnum:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fmaxnm s0, s1, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmaxnm s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.maxnum.f16(bfloat %a, bfloat %b)
@@ -1929,16 +1712,12 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 {
 define bfloat @test_copysign(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_copysign:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s1
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
 ; CHECK-CVT-NEXT:    fmov s0, w8
@@ -1947,16 +1726,12 @@ define bfloat @test_copysign(bfloat %a, bfloat %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_copysign:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-BF16-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.copysign.f16(bfloat %a, bfloat %b)
@@ -1966,12 +1741,10 @@ define bfloat @test_copysign(bfloat %a, bfloat %b) #0 {
 define bfloat @test_copysign_f32(bfloat %a, float %b) #0 {
 ; CHECK-CVT-LABEL: test_copysign_f32:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mvni v2.4s, #128, lsl #24
 ; CHECK-CVT-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
@@ -1981,12 +1754,10 @@ define bfloat @test_copysign_f32(bfloat %a, float %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_copysign_f32:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-BF16-NEXT:    mvni v2.4s, #128, lsl #24
 ; CHECK-BF16-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -1998,12 +1769,10 @@ define bfloat @test_copysign_f32(bfloat %a, float %b) #0 {
 define bfloat @test_copysign_f64(bfloat %a, double %b) #0 {
 ; CHECK-CVT-LABEL: test_copysign_f64:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    fcvt s1, d1
 ; CHECK-CVT-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
@@ -2013,12 +1782,10 @@ define bfloat @test_copysign_f64(bfloat %a, double %b) #0 {
 ;
 ; CHECK-BF16-LABEL: test_copysign_f64:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-BF16-NEXT:    fcvt s1, d1
 ; CHECK-BF16-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2032,34 +1799,33 @@ define bfloat @test_copysign_f64(bfloat %a, double %b) #0 {
 define float @test_copysign_extended(bfloat %a, bfloat %b) #0 {
 ; CHECK-CVT-LABEL: test_copysign_extended:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s1
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-CVT-NEXT:    movi v2.4s, #16
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    ushl v0.4s, v0.4s, v2.4s
 ; CHECK-CVT-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
 ; CHECK-CVT-NEXT:    fmov s0, w8
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-BF16-LABEL: test_copysign_extended:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    movi v2.4s, #16
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    ushl v0.4s, v0.4s, v2.4s
 ; CHECK-BF16-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT:    bfcvt h0, s0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-BF16-NEXT:    ret
   %r = call bfloat @llvm.copysign.f16(bfloat %a, bfloat %b)
@@ -2070,11 +1836,9 @@ define float @test_copysign_extended(bfloat %a, bfloat %b) #0 {
 define bfloat @test_floor(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_floor:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frintm s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2087,10 +1851,8 @@ define bfloat @test_floor(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_floor:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frintm s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2101,11 +1863,9 @@ define bfloat @test_floor(bfloat %a) #0 {
 define bfloat @test_ceil(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_ceil:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frintp s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2118,10 +1878,8 @@ define bfloat @test_ceil(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_ceil:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frintp s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2132,11 +1890,9 @@ define bfloat @test_ceil(bfloat %a) #0 {
 define bfloat @test_trunc(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_trunc:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frintz s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2149,10 +1905,8 @@ define bfloat @test_trunc(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_trunc:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frintz s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2163,11 +1917,9 @@ define bfloat @test_trunc(bfloat %a) #0 {
 define bfloat @test_rint(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_rint:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frintx s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2180,10 +1932,8 @@ define bfloat @test_rint(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_rint:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frintx s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2194,11 +1944,9 @@ define bfloat @test_rint(bfloat %a) #0 {
 define bfloat @test_nearbyint(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_nearbyint:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frinti s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2211,10 +1959,8 @@ define bfloat @test_nearbyint(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_nearbyint:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frinti s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2225,11 +1971,9 @@ define bfloat @test_nearbyint(bfloat %a) #0 {
 define bfloat @test_round(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_round:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frinta s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2242,10 +1986,8 @@ define bfloat @test_round(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_round:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frinta s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2256,11 +1998,9 @@ define bfloat @test_round(bfloat %a) #0 {
 define bfloat @test_roundeven(bfloat %a) #0 {
 ; CHECK-CVT-LABEL: test_roundeven:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    frintn s0, s0
 ; CHECK-CVT-NEXT:    fmov w9, s0
 ; CHECK-CVT-NEXT:    ubfx w10, w9, #16, #1
@@ -2273,10 +2013,8 @@ define bfloat @test_roundeven(bfloat %a) #0 {
 ;
 ; CHECK-BF16-LABEL: test_roundeven:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    frintn s0, s0
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
@@ -2287,27 +2025,21 @@ define bfloat @test_roundeven(bfloat %a) #0 {
 define bfloat @test_fmuladd(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-CVT-LABEL: test_fmuladd:
 ; CHECK-CVT:       // %bb.0:
-; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-NEXT:    fmov w8, s1
-; CHECK-CVT-NEXT:    fmov w9, s0
+; CHECK-CVT-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-CVT-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-NEXT:    mov w10, #32767 // =0x7fff
-; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
-; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
-; CHECK-CVT-NEXT:    fmul s0, s1, s0
+; CHECK-CVT-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-CVT-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT:    fmul s0, s0, s1
+; CHECK-CVT-NEXT:    shll v1.4s, v2.4h, #16
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
 ; CHECK-CVT-NEXT:    add w8, w8, w10
 ; CHECK-CVT-NEXT:    add w8, w9, w8
-; CHECK-CVT-NEXT:    fmov w9, s2
 ; CHECK-CVT-NEXT:    lsr w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w8, w8, #16
-; CHECK-CVT-NEXT:    lsl w9, w9, #16
 ; CHECK-CVT-NEXT:    fmov s0, w8
-; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-CVT-NEXT:    fadd s0, s0, s1
 ; CHECK-CVT-NEXT:    fmov w8, s0
 ; CHECK-CVT-NEXT:    ubfx w9, w8, #16, #1
@@ -2320,23 +2052,15 @@ define bfloat @test_fmuladd(bfloat %a, bfloat %b, bfloat %c) #0 {
 ;
 ; CHECK-BF16-LABEL: test_fmuladd:
 ; CHECK-BF16:       // %bb.0:
-; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $s1
-; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-BF16-NEXT:    fmov w8, s1
-; CHECK-BF16-NEXT:    fmov w9, s0
-; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $s2
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
-; CHECK-BF16-NEXT:    fmov s1, w9
-; CHECK-BF16-NEXT:    fmov w9, s2
-; CHECK-BF16-NEXT:    fmul s0, s1, s0
-; CHECK-BF16-NEXT:    lsl w9, w9, #16
-; CHECK-BF16-NEXT:    fmov s1, w9
+; CHECK-BF16-NEXT:    // kill: def $h1 killed $h1 def $d1
+; CHECK-BF16-NEXT:    // kill: def $h0 killed $h0 def $d0
+; CHECK-BF16-NEXT:    // kill: def $h2 killed $h2 def $d2
+; CHECK-BF16-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT:    fmul s0, s0, s1
+; CHECK-BF16-NEXT:    shll v1.4s, v2.4h, #16
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
-; CHECK-BF16-NEXT:    fmov w8, s0
-; CHECK-BF16-NEXT:    lsl w8, w8, #16
-; CHECK-BF16-NEXT:    fmov s0, w8
+; CHECK-BF16-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-BF16-NEXT:    fadd s0, s0, s1
 ; CHECK-BF16-NEXT:    bfcvt h0, s0
 ; CHECK-BF16-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
index c03e2e532132..a609e33be935 100644
--- a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
@@ -272,9 +272,8 @@ define <8 x bfloat> @d_to_h(<8 x double> %a) {
 define <8 x float> @h_to_s(<8 x bfloat> %a) {
 ; CHECK-LABEL: h_to_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll2 v1.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    ret
   %1 = fpext <8 x bfloat> %a to <8 x float>
   ret <8 x float> %1
@@ -283,13 +282,12 @@ define <8 x float> @h_to_s(<8 x bfloat> %a) {
 define <8 x double> @h_to_d(<8 x bfloat> %a) {
 ; CHECK-LABEL: h_to_d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    shll v2.4s, v0.4h, #16
-; CHECK-NEXT:    fcvtl v0.2d, v2.2s
-; CHECK-NEXT:    shll v4.4s, v1.4h, #16
-; CHECK-NEXT:    fcvtl2 v1.2d, v2.4s
-; CHECK-NEXT:    fcvtl2 v3.2d, v4.4s
-; CHECK-NEXT:    fcvtl v2.2d, v4.2s
+; CHECK-NEXT:    shll v1.4s, v0.4h, #16
+; CHECK-NEXT:    shll2 v2.4s, v0.8h, #16
+; CHECK-NEXT:    fcvtl v0.2d, v1.2s
+; CHECK-NEXT:    fcvtl2 v3.2d, v2.4s
+; CHECK-NEXT:    fcvtl2 v1.2d, v1.4s
+; CHECK-NEXT:    fcvtl v2.2d, v2.2s
 ; CHECK-NEXT:    ret
   %1 = fpext <8 x bfloat> %a to <8 x double>
   ret <8 x double> %1
@@ -788,11 +786,10 @@ define void @test_insert_at_zero(bfloat %a, ptr %b) #0 {
 define <8 x i8> @fptosi_i8(<8 x bfloat> %a) #0 {
 ; CHECK-LABEL: fptosi_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll2 v1.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
@@ -803,11 +800,10 @@ define <8 x i8> @fptosi_i8(<8 x bfloat> %a) #0 {
 define <8 x i16> @fptosi_i16(<8 x bfloat> %a) #0 {
 ; CHECK-LABEL: fptosi_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll2 v1.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %1 = fptosi<8 x bfloat> %a to <8 x i16>
@@ -817,11 +813,10 @@ define <8 x i16> @fptosi_i16(<8 x bfloat> %a) #0 {
 define <8 x i8> @fptoui_i8(<8 x bfloat> %a) #0 {
 ; CHECK-LABEL: fptoui_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll2 v1.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
@@ -832,11 +827,10 @@ define <8 x i8> @fptoui_i8(<8 x bfloat> %a) #0 {
 define <8 x i16> @fptoui_i16(<8 x bfloat> %a) #0 {
 ; CHECK-LABEL: fptoui_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    shll2 v1.4s, v0.8h, #16
 ; CHECK-NEXT:    shll v0.4s, v0.4h, #16
-; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-NEXT:    shll v1.4s, v1.4h, #16
 ; CHECK-NEXT:    fcvtzu v1.4s, v1.4s
+; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %1 = fptoui<8 x bfloat> %a to <8 x i16>
@@ -846,90 +840,58 @@ define <8 x i16> @fptoui_i16(<8 x bfloat> %a) #0 {
 define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_une:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, ne
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, ne
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, ne
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, ne
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, ne
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -941,96 +903,64 @@ define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ueq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w11, s0
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    lsl w9, w11, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s7, w9
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    csetm w10, eq
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csinv w10, w10, wzr, vc
-; CHECK-NEXT:    fcmp s7, s6
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w11, s4
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h4, v0.h[4]
-; CHECK-NEXT:    mov h7, v1.h[5]
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    csinv w8, w8, wzr, vc
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
 ; CHECK-NEXT:    csetm w9, eq
 ; CHECK-NEXT:    csinv w9, w9, wzr, vc
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
 ; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    lsl w11, w11, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s5, w11
-; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    mov v2.h[1], w10
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    fmov w10, s4
 ; CHECK-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov s6, w10
 ; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    fmov w9, s7
-; CHECK-NEXT:    fmov w10, s5
 ; CHECK-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    fmov s6, w10
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    fmov w10, s4
 ; CHECK-NEXT:    csetm w8, eq
 ; CHECK-NEXT:    csinv w8, w8, wzr, vc
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    lsl w8, w9, #16
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    csetm w10, eq
-; CHECK-NEXT:    csinv w10, w10, wzr, vc
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, eq
 ; CHECK-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    csinv w8, w8, wzr, vc
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
 ; CHECK-NEXT:    csetm w8, eq
 ; CHECK-NEXT:    csinv w8, w8, wzr, vc
@@ -1044,90 +974,58 @@ define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ugt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, hi
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, hi
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, hi
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, hi
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, hi
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, hi
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1139,90 +1037,58 @@ define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_uge:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, pl
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, pl
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, pl
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, pl
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, pl
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, pl
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1234,90 +1100,58 @@ define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ult:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, lt
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, lt
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, lt
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, lt
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, lt
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, lt
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1329,90 +1163,58 @@ define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ule:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, le
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, le
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, le
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, le
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, le
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, le
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1424,90 +1226,58 @@ define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_uno:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, vs
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, vs
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, vs
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, vs
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, vs
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, vs
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1519,96 +1289,64 @@ define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w11, s0
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    lsl w9, w11, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s7, w9
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    csetm w10, mi
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csinv w10, w10, wzr, le
-; CHECK-NEXT:    fcmp s7, s6
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w11, s4
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h4, v0.h[4]
-; CHECK-NEXT:    mov h7, v1.h[5]
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    csinv w8, w8, wzr, le
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
 ; CHECK-NEXT:    csetm w9, mi
 ; CHECK-NEXT:    csinv w9, w9, wzr, le
-; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    mov h3, v1.h[4]
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
 ; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    lsl w11, w11, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov s5, w11
-; CHECK-NEXT:    fmov s6, w8
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    mov v2.h[1], w10
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    fmov w10, s4
 ; CHECK-NEXT:    csinv w8, w8, wzr, le
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    lsl w10, w10, #16
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov s6, w10
 ; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    fmov w9, s7
-; CHECK-NEXT:    fmov w10, s5
 ; CHECK-NEXT:    csinv w8, w8, wzr, le
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    fmov s6, w10
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    fmov w10, s4
 ; CHECK-NEXT:    csetm w8, mi
 ; CHECK-NEXT:    csinv w8, w8, wzr, le
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    lsl w8, w9, #16
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    csetm w10, mi
-; CHECK-NEXT:    csinv w10, w10, wzr, le
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, mi
 ; CHECK-NEXT:    csinv w8, w8, wzr, le
-; CHECK-NEXT:    fcmp s1, s0
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    csinv w8, w8, wzr, le
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
 ; CHECK-NEXT:    csetm w8, mi
 ; CHECK-NEXT:    csinv w8, w8, wzr, le
@@ -1622,90 +1360,58 @@ define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_oeq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, eq
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, eq
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, eq
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, eq
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1717,90 +1423,58 @@ define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ogt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, gt
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, gt
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, gt
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, gt
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, gt
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, gt
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1812,90 +1486,58 @@ define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_oge:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, ge
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, ge
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, ge
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, ge
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, ge
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, ge
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -1907,90 +1549,58 @@ define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_olt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, mi
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, mi
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, mi
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, mi
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, mi
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, mi
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -2002,90 +1612,58 @@ define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ole:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, ls
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, ls
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, ls
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, ls
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, ls
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, ls
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
@@ -2097,90 +1675,58 @@ define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 define <8 x i1> @test_fcmp_ord(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
 ; CHECK-LABEL: test_fcmp_ord:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov h2, v1.h[2]
-; CHECK-NEXT:    mov h3, v0.h[2]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    mov h3, v1.h[4]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    fmov s5, w9
-; CHECK-NEXT:    mov h4, v1.h[3]
-; CHECK-NEXT:    lsl w10, w10, #16
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    csetm w9, vc
-; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    mov h5, v0.h[3]
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    mov h6, v0.h[4]
-; CHECK-NEXT:    mov h4, v1.h[5]
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    dup v2.4h, v1.h[1]
+; CHECK-NEXT:    dup v3.4h, v0.h[1]
+; CHECK-NEXT:    dup v4.4h, v1.h[2]
+; CHECK-NEXT:    dup v5.4h, v0.h[2]
+; CHECK-NEXT:    dup v6.4h, v0.h[3]
+; CHECK-NEXT:    shll v2.4s, v2.4h, #16
+; CHECK-NEXT:    shll v3.4s, v3.4h, #16
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
 ; CHECK-NEXT:    csetm w8, vc
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    mov h5, v0.h[5]
-; CHECK-NEXT:    fcmp s16, s7
-; CHECK-NEXT:    mov v2.h[1], w9
-; CHECK-NEXT:    lsl w9, w10, #16
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    fmov s7, w8
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    shll v4.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.4h, v1.h[3]
+; CHECK-NEXT:    csetm w9, vc
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[4]
+; CHECK-NEXT:    dup v6.8h, v0.h[4]
+; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[5]
+; CHECK-NEXT:    dup v6.8h, v0.h[5]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fcmp s7, s3
-; CHECK-NEXT:    mov h3, v1.h[6]
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    mov h1, v1.h[7]
-; CHECK-NEXT:    fmov s6, w9
-; CHECK-NEXT:    fmov w9, s5
 ; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    dup v5.8h, v1.h[6]
+; CHECK-NEXT:    dup v6.8h, v0.h[6]
+; CHECK-NEXT:    dup v1.8h, v1.h[7]
+; CHECK-NEXT:    dup v0.8h, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[3], w8
-; CHECK-NEXT:    lsl w8, w10, #16
-; CHECK-NEXT:    fcmp s6, s4
-; CHECK-NEXT:    mov h4, v0.h[6]
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    mov h0, v0.h[7]
-; CHECK-NEXT:    fmov s6, w9
 ; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    shll v3.4s, v5.4h, #16
+; CHECK-NEXT:    shll v4.4s, v6.4h, #16
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    mov v2.h[4], w8
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcmp s6, s5
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    csetm w10, vc
-; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov v2.h[5], w10
-; CHECK-NEXT:    lsl w8, w8, #16
-; CHECK-NEXT:    fcmp s4, s3
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
 ; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    fcmp s4, s3
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    csetm w8, vc
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    fcmp s1, s0
 ; CHECK-NEXT:    csetm w8, vc
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    xtn v0.8b, v2.8h
diff --git a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll
index 40684b0f3a25..e3263252875f 100644
--- a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll
+++ b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll
@@ -76,11 +76,9 @@ entry:
 define bfloat @t7(bfloat %x)  {
 ; CHECK-LABEL: t7:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs w9, s0
 ; CHECK-NEXT:    scvtf d0, w9
 ; CHECK-NEXT:    fcvtxn s0, d0
@@ -101,11 +99,9 @@ entry:
 define bfloat @t8(bfloat %x)  {
 ; CHECK-LABEL: t8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzu w9, s0
 ; CHECK-NEXT:    ucvtf d0, w9
 ; CHECK-NEXT:    fcvtxn s0, d0
@@ -198,11 +194,9 @@ entry:
 define bfloat @t7_strict(bfloat %x) #0 {
 ; CHECK-LABEL: t7_strict:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs w9, s0
 ; CHECK-NEXT:    scvtf d0, w9
 ; CHECK-NEXT:    fcvtxn s0, d0
@@ -223,11 +217,9 @@ entry:
 define bfloat @t8_strict(bfloat %x) #0 {
 ; CHECK-LABEL: t8_strict:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzu w9, s0
 ; CHECK-NEXT:    ucvtf d0, w9
 ; CHECK-NEXT:    fcvtxn s0, d0
diff --git a/llvm/test/CodeGen/AArch64/hwasan-zero-ptr.ll b/llvm/test/CodeGen/AArch64/hwasan-zero-ptr.ll
new file mode 100644
index 000000000000..dca39fe03fb1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/hwasan-zero-ptr.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -filetype asm -o - %s | FileCheck %s
+
+; This shows that when dereferencing a null pointer, HWASan will call
+; __hwasan_check_x4294967071_19_fixed_0_short_v2
+; (N.B. 4294967071 == 2**32 - 239 + 14 == 2**32 - X0 + XZR
+;
+; The source was generated from llvm/test/Instrumentation/HWAddressSanitizer/zero-ptr.ll.
+
+; ModuleID = '<stdin>'
+source_filename = "<stdin>"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android10000"
+
+$hwasan.module_ctor = comdat any
+
+@__start_hwasan_globals = external hidden constant [0 x i8]
+@__stop_hwasan_globals = external hidden constant [0 x i8]
+@hwasan.note = private constant { i32, i32, i32, [8 x i8], i32, i32 } { i32 8, i32 8, i32 3, [8 x i8] c"LLVM\00\00\00\00", i32 trunc (i64 sub (i64 ptrtoint (ptr @__start_hwasan_globals to i64), i64 ptrtoint (ptr @hwasan.note to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr @__stop_hwasan_globals to i64), i64 ptrtoint (ptr @hwasan.note to i64)) to i32) }, section ".note.hwasan.globals", comdat($hwasan.module_ctor), align 4
+
+; Function Attrs: sanitize_hwaddress
+define void @test_store_to_zeroptr() #0 {
+; CHECK-LABEL: test_store_to_zeroptr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl __hwasan_check_x4294967071_19_fixed_0_short_v2
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    mov w9, #42 // =0x2a
+; CHECK-NEXT:    str x9, [x8]
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %.hwasan.shadow = call ptr asm "", "=r,0"(ptr null)
+  %b = inttoptr i64 0 to ptr
+  call void @llvm.hwasan.check.memaccess.shortgranules.fixedshadow(ptr %b, i32 19, i64 0)
+  store i64 42, ptr %b, align 8
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.hwasan.check.memaccess.shortgranules.fixedshadow(ptr, i32 immarg, i64 immarg) #1
+
+attributes #0 = { sanitize_hwaddress }
+attributes #1 = { nounwind }
+
+declare void @__hwasan_init()
+
+; Function Attrs: nounwind
+define internal void @hwasan.module_ctor() #1 comdat {
+; CHECK-LABEL: hwasan.module_ctor:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    bl __hwasan_init
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  call void @__hwasan_init()
+  ret void
+}
+
+!llvm.module.flags = !{!1}
+
+!0 = !{ptr @hwasan.note}
+!1 = !{i32 4, !"nosanitize_hwaddress", i32 1}
diff --git a/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir b/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir
index b61fa4be0400..08fc47d9480c 100644
--- a/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir
+++ b/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir
@@ -1,5 +1,5 @@
+# RUN: llc -mtriple=aarch64 -o /dev/null -run-pass=register-coalescer -aarch64-enable-subreg-liveness-tracking -debug-only=regalloc %s 2>&1 | FileCheck %s --check-prefix=CHECK-DBG
 # RUN: llc -mtriple=aarch64 -verify-machineinstrs -o - -run-pass=register-coalescer -aarch64-enable-subreg-liveness-tracking %s | FileCheck %s --check-prefix=CHECK
-# RUN: llc -mtriple=aarch64 -verify-machineinstrs -o /dev/null -run-pass=register-coalescer -aarch64-enable-subreg-liveness-tracking -debug-only=regalloc %s 2>&1 | FileCheck %s --check-prefix=CHECK-DBG
 # REQUIRES: asserts
 
 # CHECK-DBG: ********** REGISTER COALESCER **********
@@ -36,3 +36,94 @@ body:             |
     RET_ReallyLR
 
 ...
+# CHECK-DBG: ********** REGISTER COALESCER **********
+# CHECK-DBG: ********** Function: reproducer
+# CHECK-DBG: ********** JOINING INTERVALS ***********
+# CHECK-DBG: ********** INTERVALS **********
+# CHECK-DBG: %1 [32r,48B:2)[48B,320r:0)[320r,368B:1) 0@48B-phi 1@320r 2@32r
+# CHECK-DBG-SAME: weight:0.000000e+00
+# CHECK-DBG: %3 [80r,160B:2)[240r,272B:1)[288r,304B:0)[304B,320r:3) 0@288r 1@240r 2@80r 3@304B-phi
+# CHECK-DBG-SAME: L0000000000000080 [288r,304B:0)[304B,320r:3) 0@288r 1@x 2@x 3@304B-phi
+# CHECK-DBG-SAME: L0000000000000040 [80r,160B:2)[240r,272B:1)[288r,304B:0)[304B,320r:3) 0@288r 1@240r 2@80r 3@304B-phi
+# CHECK-DBG-SAME: weight:0.000000e+00
+---
+name:              reproducer
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    %0:gpr32 = MOVi32imm 1
+    %1:gpr64 = IMPLICIT_DEF
+
+  bb.1:
+
+  bb.2:
+    %3:gpr64all = SUBREG_TO_REG 0, %0, %subreg.sub_32
+
+  bb.3:
+    $nzcv = IMPLICIT_DEF
+    %4:gpr64 = COPY killed %3
+    Bcc 1, %bb.7, implicit killed $nzcv
+
+  bb.4:
+    $nzcv = IMPLICIT_DEF
+    Bcc 1, %bb.6, implicit killed $nzcv
+
+  bb.5:
+    %5:gpr64all = SUBREG_TO_REG 0, %0, %subreg.sub_32
+    %4:gpr64 = COPY killed %5
+    B %bb.7
+
+  bb.6:
+    %4:gpr64 = COPY $xzr
+
+  bb.7:
+    %7:gpr64 = ADDXrs killed %1, killed %4, 1
+    %1:gpr64 = COPY killed %7
+    B %bb.1
+
+...
+# CHECK-DBG: ********** REGISTER COALESCER **********
+# CHECK-DBG: ********** Function: reproducer2
+# CHECK-DBG: ********** JOINING INTERVALS ***********
+# CHECK-DBG: ********** INTERVALS **********
+# CHECK-DBG: %1 [32r,48B:2)[48B,304r:0)[304r,352B:1) 0@48B-phi 1@304r 2@32r
+# CHECK-DBG-SAME: weight:0.000000e+00
+# CHECK-DBG: %3 [80r,160B:2)[224r,256B:1)[272r,288B:0)[288B,304r:3) 0@272r 1@224r 2@80r 3@288B-phi
+# CHECK-DBG-SAME: L0000000000000080 [224r,256B:1)[272r,288B:0)[288B,304r:3) 0@272r 1@224r 2@x 3@288B-phi
+# CHECK-DBG-SAME: L0000000000000040 [80r,160B:2)[224r,256B:1)[272r,288B:0)[288B,304r:3) 0@272r 1@224r 2@80r 3@288B-phi
+# CHECK-DBG-SAME: weight:0.000000e+00
+---
+name:              reproducer2
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    %0:gpr32 = MOVi32imm 1
+    %1:gpr64 = IMPLICIT_DEF
+
+  bb.1:
+
+  bb.2:
+    %3:gpr64all = SUBREG_TO_REG 0, %0, %subreg.sub_32
+
+  bb.3:
+    $nzcv = IMPLICIT_DEF
+    %4:gpr64 = COPY killed %3
+    Bcc 1, %bb.7, implicit killed $nzcv
+
+  bb.4:
+    $nzcv = IMPLICIT_DEF
+    Bcc 1, %bb.6, implicit killed $nzcv
+
+  bb.5:
+    %4:gpr64 = IMPLICIT_DEF
+    B %bb.7
+
+  bb.6:
+    %4:gpr64 = COPY $xzr
+
+  bb.7:
+    %5:gpr64 = ADDXrs killed %1, killed %4, 1
+    %1:gpr64 = COPY killed %5
+    B %bb.1
+
+...
diff --git a/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll b/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll
index ec7548e1e654..b7fae2bff687 100644
--- a/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll
@@ -7,19 +7,17 @@
 define i32 @testmswbf(bfloat %a) {
 ; CHECK-LABEL: testmswbf:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    frintm s0, s0
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-NEXT:    add w8, w9, w8
 ; CHECK-NEXT:    add w8, w10, w8
 ; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    lsl w8, w8, #16
 ; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs w0, s0
 ; CHECK-NEXT:    ret
 entry:
@@ -31,19 +29,17 @@ entry:
 define i64 @testmsxbf(bfloat %a) {
 ; CHECK-LABEL: testmsxbf:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    frintm s0, s0
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-NEXT:    add w8, w9, w8
 ; CHECK-NEXT:    add w8, w10, w8
 ; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    lsl w8, w8, #16
 ; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs x0, s0
 ; CHECK-NEXT:    ret
 entry:
@@ -141,19 +137,17 @@ entry:
 define i32 @testpswbf(bfloat %a) {
 ; CHECK-LABEL: testpswbf:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    frintp s0, s0
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-NEXT:    add w8, w9, w8
 ; CHECK-NEXT:    add w8, w10, w8
 ; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    lsl w8, w8, #16
 ; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs w0, s0
 ; CHECK-NEXT:    ret
 entry:
@@ -165,19 +159,17 @@ entry:
 define i64 @testpsxbf(bfloat %a) {
 ; CHECK-LABEL: testpsxbf:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $s0
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-NEXT:    mov w8, #32767 // =0x7fff
-; CHECK-NEXT:    lsl w9, w9, #16
-; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    frintp s0, s0
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    ubfx w10, w9, #16, #1
 ; CHECK-NEXT:    add w8, w9, w8
 ; CHECK-NEXT:    add w8, w10, w8
 ; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    lsl w8, w8, #16
 ; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
 ; CHECK-NEXT:    fcvtzs x0, s0
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-state.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-state.ll
new file mode 100644
index 000000000000..5037772a51ce
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-state.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+
+
+define i1 @streaming_mode_streaming_compatible() #0 {
+; CHECK-LABEL: streaming_mode_streaming_compatible:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    bl __arm_sme_state
+; CHECK-NEXT:    and w0, w0, #0x1
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %mode = tail call noundef i1 @llvm.aarch64.sme.in.streaming.mode()
+  ret i1 %mode
+}
+
+
+attributes #0 = {nounwind memory(none) "aarch64_pstate_sm_compatible"}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index b0f2aac9a42d..7cafa2f608a4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -3990,6 +3990,116 @@ bb:
   ret void
 }
 
+define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) {
+; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-NEXT:    v_add_u32_e32 v0, s3, v0
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-NEXT:    v_add3_u32 v0, s2, v0, -16
+; GFX9-NEXT:    v_mov_b32_e32 v1, 15
+; GFX9-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX10:       ; %bb.0: ; %bb
+; GFX10-NEXT:    s_add_u32 s0, s0, s5
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 15
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s2, v0
+; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:-16
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX940:       ; %bb.0: ; %bb
+; GFX940-NEXT:    v_add_u32_e32 v0, s1, v0
+; GFX940-NEXT:    v_add3_u32 v0, s0, v0, -16
+; GFX940-NEXT:    v_mov_b32_e32 v1, 15
+; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_endpgm
+;
+; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:-16 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX9:       ; %bb.0: ; %bb
+; UNALIGNED_GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s5
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v0, s3, v0
+; UNALIGNED_GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
+; UNALIGNED_GFX9-NEXT:    v_add3_u32 v0, s2, v0, -16
+; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v1, 15
+; UNALIGNED_GFX9-NEXT:    scratch_store_dword v0, v1, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX10:       ; %bb.0: ; %bb
+; UNALIGNED_GFX10-NEXT:    s_add_u32 s0, s0, s5
+; UNALIGNED_GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; UNALIGNED_GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; UNALIGNED_GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v0, s3, v0
+; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v1, 15
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v0, s2, v0
+; UNALIGNED_GFX10-NEXT:    scratch_store_dword v0, v1, off offset:-16
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX940:       ; %bb.0: ; %bb
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v0, s1, v0
+; UNALIGNED_GFX940-NEXT:    v_add3_u32 v0, s0, v0, -16
+; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v1, 15
+; UNALIGNED_GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX11:       ; %bb.0: ; %bb
+; UNALIGNED_GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
+; UNALIGNED_GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:-16 dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; UNALIGNED_GFX12:       ; %bb.0: ; %bb
+; UNALIGNED_GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
+; UNALIGNED_GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; UNALIGNED_GFX12-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    s_endpgm
+bb:
+  %add1 = add nsw i32 %sidx, %vidx
+  %add2 = add nsw i32 %add1, -16
+  %gep = getelementptr inbounds [16 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2
+  store volatile i32 15, ptr addrspace(5) %gep, align 4
+  ret void
+}
+
 define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) {
 ; GFX9-LABEL: sgpr_base_negative_offset:
 ; GFX9:       ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll b/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll
new file mode 100644
index 000000000000..52259c4c2e6e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+; Reduce a 64-bit add by a constant if we know the low 32-bits are all
+; zero.
+
+; add i64:x, K if computeTrailingZeros(K) >= 32
+; => build_pair (add x.hi, K.hi), x.lo
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_0(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 0x40000
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add i64 %reg, 1125899906842624 ; (1 << 50)
+  ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_1(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 1
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add i64 %reg, 4294967296 ; (1 << 32)
+  ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_2(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 2
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add i64 %reg, 8589934592 ; (1 << 33)
+  ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_3(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 0x80000000
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add i64 %reg, -9223372036854775808 ; (1 << 63)
+  ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_4(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_low_bits_known0_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, -1
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add i64 %reg, -4294967296 ; 0xffffffff00000000
+  ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_0(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x40000, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i64 %reg, 1125899906842624 ; (1 << 50)
+  ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_1(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i64 %reg, 4294967296 ; (1 << 32)
+  ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_2(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 2, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i64 %reg, 8589934592 ; (1 << 33)
+  ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_3(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i64 %reg, -9223372036854775808 ; (1 << 63)
+  ret i64 %add
+}
+
+define i64 @v_add_i64_const_low_bits_known0_4(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_low_bits_known0_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i64 %reg, -4294967296 ; 0xffffffff00000000
+  ret i64 %add
+}
+
+define amdgpu_ps i64 @s_add_i64_const_high_bits_known0_0(i64 inreg %reg) {
+; GFX9-LABEL: s_add_i64_const_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_u32 s0, s0, -1
+; GFX9-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add i64 %reg, 4294967295 ; (1 << 31)
+  ret i64 %add
+}
+
+define i64 @v_add_i64_const_high_bits_known0_0(i64 %reg) {
+; GFX9-LABEL: v_add_i64_const_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, -1, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add i64 %reg, 4294967295 ; (1 << 31)
+  ret i64 %add
+}
+
+define <2 x i64> @v_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> %reg) {
+; GFX9-LABEL: v_add_v2i64_splat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT:    v_add_u32_e32 v3, 1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
+  ret <2 x i64> %add
+}
+
+define <2 x i64> @v_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) {
+; GFX9-LABEL: v_add_v2i64_nonsplat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT:    v_add_u32_e32 v3, 2, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
+  ret <2 x i64> %add
+}
+
+define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
+; GFX9-LABEL: s_add_v2i64_splat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 1
+; GFX9-NEXT:    s_add_i32 s3, s3, 1
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
+  ret <2 x i64> %add
+}
+
+define amdgpu_ps <2 x i64> @s_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
+; GFX9-LABEL: s_add_v2i64_nonsplat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 1
+; GFX9-NEXT:    s_add_i32 s3, s3, 2
+; GFX9-NEXT:    ; return to shader part epilog
+  %add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
+  ret <2 x i64> %add
+}
+
+; We could reduce this to use a 32-bit add if we use computeKnownBits
+define i64 @v_add_i64_variable_high_bits_known0_0(i64 %reg, i32 %offset.hi32) {
+; GFX9-LABEL: v_add_i64_variable_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %zext.offset.hi32 = zext i32 %offset.hi32 to i64
+  %in.high.bits = shl i64 %zext.offset.hi32, 32
+  %add = add i64 %reg, %in.high.bits
+  ret i64 %add
+}
+
+; We could reduce this to use a 32-bit add if we use computeKnownBits
+define amdgpu_ps i64 @s_add_i64_variable_high_bits_known0_0(i64 inreg %reg, i32 inreg %offset.hi32) {
+; GFX9-LABEL: s_add_i64_variable_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_u32 s0, s0, 0
+; GFX9-NEXT:    s_addc_u32 s1, s1, s2
+; GFX9-NEXT:    ; return to shader part epilog
+  %zext.offset.hi32 = zext i32 %offset.hi32 to i64
+  %in.high.bits = shl i64 %zext.offset.hi32, 32
+  %add = add i64 %reg, %in.high.bits
+  ret i64 %add
+}
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 97d642b991f7..5415af02ef89 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -5249,6 +5249,114 @@ bb:
   ret void
 }
 
+define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) {
+; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-NEXT:    s_add_i32 s2, s2, s3
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, -16, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 15
+; GFX9-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX10:       ; %bb.0: ; %bb
+; GFX10-NEXT:    s_add_u32 s0, s0, s5
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-NEXT:    v_add3_u32 v0, s2, s3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 15
+; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:-16
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    v_add3_u32 v0, s0, s1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 15
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:-16 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    v_mov_b32_e32 v1, 15
+; GFX12-NEXT:    s_add_co_i32 s0, s0, s1
+; GFX12-NEXT:    scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_endpgm
+;
+; GFX9-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX9-PAL:       ; %bb.0: ; %bb
+; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-PAL-NEXT:    s_mov_b32 s2, s8
+; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
+; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
+; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-PAL-NEXT:    s_add_i32 s0, s0, s1
+; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX9-PAL-NEXT:    v_add_u32_e32 v0, -16, v0
+; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_endpgm
+;
+; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX940:       ; %bb.0: ; %bb
+; GFX940-NEXT:    s_add_i32 s0, s0, s1
+; GFX940-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-NEXT:    v_add_u32_e32 v0, -16, v0
+; GFX940-NEXT:    v_mov_b32_e32 v1, 15
+; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_endpgm
+;
+; GFX10-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX10-PAL:       ; %bb.0: ; %bb
+; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX10-PAL-NEXT:    s_mov_b32 s2, s8
+; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s5
+; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-PAL-NEXT:    v_add3_u32 v0, s0, s1, v0
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
+; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:-16
+; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT:    s_endpgm
+;
+; GFX11-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX11-PAL:       ; %bb.0: ; %bb
+; GFX11-PAL-NEXT:    v_add3_u32 v0, s0, s1, v0
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
+; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:-16 dlc
+; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-PAL-NEXT:    s_endpgm
+;
+; GFX12-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
+; GFX12-PAL:       ; %bb.0: ; %bb
+; GFX12-PAL-NEXT:    v_mov_b32_e32 v1, 15
+; GFX12-PAL-NEXT:    s_add_co_i32 s0, s0, s1
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
+; GFX12-PAL-NEXT:    s_endpgm
+bb:
+  %add1 = add nsw i32 %sidx, %vidx
+  %add2 = add nsw i32 %add1, -16
+  %gep = getelementptr inbounds [16 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2
+  store volatile i32 15, ptr addrspace(5) %gep, align 4
+  ret void
+}
+
 define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) {
 ; GFX9-LABEL: sgpr_base_negative_offset:
 ; GFX9:       ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 157f91ccc6b1..b2f113f08a91 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -668,37 +668,32 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFFFF(ptr addrspace(1)
 define amdgpu_ps float @global_load_saddr_i8_offset_0x100000000(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_0x100000000:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_add_i32 s3, s3, 1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: global_load_saddr_i8_offset_0x100000000:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
-; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_add_i32 s3, s3, 1
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_load_saddr_i8_offset_0x100000000:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_add_i32 s3, s3, 1
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000000:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_mov_b32 s0, 0
-; GFX12-SDAG-NEXT:    s_mov_b32 s1, 1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
-; GFX12-SDAG-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT:    s_add_co_i32 s3, s3, 1
+; GFX12-SDAG-NEXT:    s_load_u8 s0, s[2:3], 0x0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
@@ -934,37 +929,32 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0xFFFFFFFF(ptr addrspace(
 define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000000(ptr addrspace(1) inreg %sbase) {
 ; GFX9-LABEL: global_load_saddr_i8_offset_neg0x100000000:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT:    s_add_i32 s3, s3, -1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: global_load_saddr_i8_offset_neg0x100000000:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
-; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_add_i32 s3, s3, -1
+; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: global_load_saddr_i8_offset_neg0x100000000:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_add_i32 s3, s3, -1
+; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0x100000000:
 ; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_mov_b32 s0, 0
-; GFX12-SDAG-NEXT:    s_mov_b32 s1, -1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
-; GFX12-SDAG-NEXT:    s_load_u8 s0, s[0:1], 0x0
+; GFX12-SDAG-NEXT:    s_add_co_i32 s3, s3, -1
+; GFX12-SDAG-NEXT:    s_load_u8 s0, s[2:3], 0x0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index 73f3d4c037ad..774a22fb907d 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,SDAG,SDAG-GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GISEL,GISEL-GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12 %s
 
 define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_minmax_i32:
@@ -8,6 +10,16 @@ define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_maxmin_i32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maxmin_i32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
   %sminmax = call i32 @llvm.smin.i32(i32 %smax, i32 %c)
   ret i32 %sminmax
@@ -45,6 +57,16 @@ define i32 @test_minmax_commuted_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_maxmin_i32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_commuted_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maxmin_i32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
   %sminmax = call i32 @llvm.smin.i32(i32 %c, i32 %smax)
   ret i32 %sminmax
@@ -56,6 +78,16 @@ define i32 @test_maxmin_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_minmax_i32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minmax_i32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b)
   %smaxmin = call i32 @llvm.smax.i32(i32 %smin, i32 %c)
   ret i32 %smaxmin
@@ -67,6 +99,16 @@ define i32 @test_maxmin_commuted_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_minmax_i32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_commuted_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minmax_i32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b)
   %smaxmin = call i32 @llvm.smax.i32(i32 %c, i32 %smin)
   ret i32 %smaxmin
@@ -79,6 +121,17 @@ define void @test_smed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
 ; GFX11-NEXT:    v_med3_i32 v2, v2, v3, v4
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_smed3_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_med3_i32 v2, v2, v3, v4
+; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call i32 @llvm.smin.i32(i32 %x, i32 %y)
   %tmp1 = call i32 @llvm.smax.i32(i32 %x, i32 %y)
   %tmp2 = call i32 @llvm.smin.i32(i32 %tmp1, i32 %z)
@@ -93,6 +146,16 @@ define i32 @test_minmax_u32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_maxmin_u32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_u32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maxmin_u32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
   %uminmax = call i32 @llvm.umin.i32(i32 %umax, i32 %c)
   ret i32 %uminmax
@@ -130,6 +193,16 @@ define i32 @test_minmax_commuted_u32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_maxmin_u32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_minmax_commuted_u32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maxmin_u32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
   %uminmax = call i32 @llvm.umin.i32(i32 %c, i32 %umax)
   ret i32 %uminmax
@@ -141,6 +214,16 @@ define i32 @test_maxmin_u32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_minmax_u32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_u32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minmax_u32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b)
   %umaxmin = call i32 @llvm.umax.i32(i32 %umin, i32 %c)
   ret i32 %umaxmin
@@ -152,6 +235,16 @@ define i32 @test_maxmin_commuted_u32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_minmax_u32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_maxmin_commuted_u32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minmax_u32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b)
   %umaxmin = call i32 @llvm.umax.i32(i32 %c, i32 %umin)
   ret i32 %umaxmin
@@ -164,6 +257,17 @@ define void @test_umed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
 ; GFX11-NEXT:    v_med3_u32 v2, v2, v3, v4
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_umed3_i32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_med3_u32 v2, v2, v3, v4
+; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call i32 @llvm.umin.i32(i32 %x, i32 %y)
   %tmp1 = call i32 @llvm.umax.i32(i32 %x, i32 %y)
   %tmp2 = call i32 @llvm.umin.i32(i32 %tmp1, i32 %z)
@@ -173,44 +277,88 @@ define void @test_umed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
 }
 
 define float @test_minmax_f32_ieee_true(float %a, float %b, float %c) {
-; SDAG-LABEL: test_minmax_f32_ieee_true:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
-; SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
-; SDAG-NEXT:    v_maxmin_f32 v0, v0, v1, v2
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: test_minmax_f32_ieee_true:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
+; SDAG-GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; SDAG-GFX11-NEXT:    v_maxmin_f32 v0, v0, v1, v2
+; SDAG-GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GISEL-LABEL: test_minmax_f32_ieee_true:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GISEL-NEXT:    v_maxmin_f32 v0, v0, v1, v2
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GISEL-GFX11-LABEL: test_minmax_f32_ieee_true:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GISEL-GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; GISEL-GFX11-NEXT:    v_maxmin_f32 v0, v0, v1, v2
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: test_minmax_f32_ieee_true:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT:    v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0
+; SDAG-GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
+; SDAG-GFX12-NEXT:    v_maxmin_num_f32 v0, v0, v1, v2
+; SDAG-GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: test_minmax_f32_ieee_true:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
+; GISEL-GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
+; GISEL-GFX12-NEXT:    v_maxmin_num_f32 v0, v0, v1, v2
+; GISEL-GFX12-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maxnum.f32(float %a, float %b)
   %minmax = call float @llvm.minnum.f32(float %max, float %c)
   ret float %minmax
 }
 
 define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg %b, float inreg %c, ptr addrspace(1) inreg %out) {
-; SDAG-LABEL: s_test_minmax_f32_ieee_false:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; SDAG-NEXT:    s_mov_b32 s5, s4
-; SDAG-NEXT:    s_mov_b32 s4, s3
-; SDAG-NEXT:    v_maxmin_f32 v0, s0, s1, v0
-; SDAG-NEXT:    global_store_b32 v1, v0, s[4:5]
-; SDAG-NEXT:    s_endpgm
+; SDAG-GFX11-LABEL: s_test_minmax_f32_ieee_false:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX11-NEXT:    s_mov_b32 s5, s4
+; SDAG-GFX11-NEXT:    s_mov_b32 s4, s3
+; SDAG-GFX11-NEXT:    v_maxmin_f32 v0, s0, s1, v0
+; SDAG-GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
+; SDAG-GFX11-NEXT:    s_endpgm
 ;
-; GISEL-LABEL: s_test_minmax_f32_ieee_false:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; GISEL-NEXT:    s_mov_b32 s6, s3
-; GISEL-NEXT:    s_mov_b32 s7, s4
-; GISEL-NEXT:    v_maxmin_f32 v0, s0, s1, v0
-; GISEL-NEXT:    global_store_b32 v1, v0, s[6:7]
-; GISEL-NEXT:    s_endpgm
+; GISEL-GFX11-LABEL: s_test_minmax_f32_ieee_false:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GISEL-GFX11-NEXT:    s_mov_b32 s6, s3
+; GISEL-GFX11-NEXT:    s_mov_b32 s7, s4
+; GISEL-GFX11-NEXT:    v_maxmin_f32 v0, s0, s1, v0
+; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[6:7]
+; GISEL-GFX11-NEXT:    s_endpgm
+;
+; SDAG-GFX12-LABEL: s_test_minmax_f32_ieee_false:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX12-NEXT:    s_mov_b32 s5, s4
+; SDAG-GFX12-NEXT:    s_mov_b32 s4, s3
+; SDAG-GFX12-NEXT:    v_maxmin_num_f32 v0, s0, s1, v0
+; SDAG-GFX12-NEXT:    global_store_b32 v1, v0, s[4:5]
+; SDAG-GFX12-NEXT:    s_endpgm
+;
+; GISEL-GFX12-LABEL: s_test_minmax_f32_ieee_false:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_max_num_f32 s0, s0, s1
+; GISEL-GFX12-NEXT:    s_mov_b32 s6, s3
+; GISEL-GFX12-NEXT:    s_mov_b32 s7, s4
+; GISEL-GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-GFX12-NEXT:    s_min_num_f32 s0, s0, s2
+; GISEL-GFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX12-NEXT:    global_store_b32 v1, v0, s[6:7]
+; GISEL-GFX12-NEXT:    s_endpgm
   %smax = call float @llvm.maxnum.f32(float %a, float %b)
   %sminmax = call float @llvm.minnum.f32(float %smax, float %c)
   store float %sminmax, ptr addrspace(1) %out
@@ -222,27 +370,56 @@ define amdgpu_ps float @test_minmax_commuted_f32_ieee_false(float %a, float %b,
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_maxmin_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: test_minmax_commuted_f32_ieee_false:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_maxmin_num_f32 v0, v0, v1, v2
+; GFX12-NEXT:    ; return to shader part epilog
   %max = call float @llvm.maxnum.f32(float %a, float %b)
   %minmax = call float @llvm.minnum.f32(float %c, float %max)
   ret float %minmax
 }
 
 define float @test_maxmin_f32_ieee_true(float %a, float %b, float %c) {
-; SDAG-LABEL: test_maxmin_f32_ieee_true:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
-; SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
-; SDAG-NEXT:    v_minmax_f32 v0, v0, v1, v2
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: test_maxmin_f32_ieee_true:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
+; SDAG-GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; SDAG-GFX11-NEXT:    v_minmax_f32 v0, v0, v1, v2
+; SDAG-GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GISEL-LABEL: test_maxmin_f32_ieee_true:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GISEL-NEXT:    v_minmax_f32 v0, v0, v1, v2
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GISEL-GFX11-LABEL: test_maxmin_f32_ieee_true:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GISEL-GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; GISEL-GFX11-NEXT:    v_minmax_f32 v0, v0, v1, v2
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: test_maxmin_f32_ieee_true:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT:    v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0
+; SDAG-GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
+; SDAG-GFX12-NEXT:    v_minmax_num_f32 v0, v0, v1, v2
+; SDAG-GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: test_maxmin_f32_ieee_true:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
+; GISEL-GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
+; GISEL-GFX12-NEXT:    v_minmax_num_f32 v0, v0, v1, v2
+; GISEL-GFX12-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float %a, float %b)
   %maxmin = call float @llvm.maxnum.f32(float %min, float %c)
   ret float %maxmin
@@ -253,6 +430,11 @@ define amdgpu_ps float @test_maxmin_commuted_f32_ieee_false(float %a, float %b,
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_minmax_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: test_maxmin_commuted_f32_ieee_false:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_minmax_num_f32 v0, v0, v1, v2
+; GFX12-NEXT:    ; return to shader part epilog
   %min = call float @llvm.minnum.f32(float %a, float %b)
   %maxmin = call float @llvm.maxnum.f32(float %c, float %min)
   ret float %maxmin
@@ -265,6 +447,17 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z)
 ; GFX11-NEXT:    v_med3_f32 v2, v2, v3, v4
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_med3_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_med3_num_f32 v2, v2, v3, v4
+; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call float @llvm.minnum.f32(float %x, float %y)
   %tmp1 = call float @llvm.maxnum.f32(float %x, float %y)
   %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %z)
@@ -278,29 +471,54 @@ define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_maxmin_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: test_minmax_f16_ieee_false:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_maxmin_num_f16 v0, v0, v1, v2
+; GFX12-NEXT:    ; return to shader part epilog
   %max = call half @llvm.maxnum.f16(half %a, half %b)
   %minmax = call half @llvm.minnum.f16(half %max, half %c)
   ret half %minmax
 }
 
 define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b, half inreg %c, ptr addrspace(1) inreg %out) {
-; SDAG-LABEL: s_test_minmax_f16_ieee_false:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; SDAG-NEXT:    s_mov_b32 s5, s4
-; SDAG-NEXT:    s_mov_b32 s4, s3
-; SDAG-NEXT:    v_maxmin_f16 v0, s0, s1, v0
-; SDAG-NEXT:    global_store_b16 v1, v0, s[4:5]
-; SDAG-NEXT:    s_endpgm
+; SDAG-GFX11-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX11-NEXT:    s_mov_b32 s5, s4
+; SDAG-GFX11-NEXT:    s_mov_b32 s4, s3
+; SDAG-GFX11-NEXT:    v_maxmin_f16 v0, s0, s1, v0
+; SDAG-GFX11-NEXT:    global_store_b16 v1, v0, s[4:5]
+; SDAG-GFX11-NEXT:    s_endpgm
 ;
-; GISEL-LABEL: s_test_minmax_f16_ieee_false:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
-; GISEL-NEXT:    s_mov_b32 s6, s3
-; GISEL-NEXT:    s_mov_b32 s7, s4
-; GISEL-NEXT:    v_maxmin_f16 v0, s0, s1, v0
-; GISEL-NEXT:    global_store_b16 v1, v0, s[6:7]
-; GISEL-NEXT:    s_endpgm
+; GISEL-GFX11-LABEL: s_test_minmax_f16_ieee_false:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GISEL-GFX11-NEXT:    s_mov_b32 s6, s3
+; GISEL-GFX11-NEXT:    s_mov_b32 s7, s4
+; GISEL-GFX11-NEXT:    v_maxmin_f16 v0, s0, s1, v0
+; GISEL-GFX11-NEXT:    global_store_b16 v1, v0, s[6:7]
+; GISEL-GFX11-NEXT:    s_endpgm
+;
+; SDAG-GFX12-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX12-NEXT:    s_mov_b32 s5, s4
+; SDAG-GFX12-NEXT:    s_mov_b32 s4, s3
+; SDAG-GFX12-NEXT:    v_maxmin_num_f16 v0, s0, s1, v0
+; SDAG-GFX12-NEXT:    global_store_b16 v1, v0, s[4:5]
+; SDAG-GFX12-NEXT:    s_endpgm
+;
+; GISEL-GFX12-LABEL: s_test_minmax_f16_ieee_false:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_max_num_f16 s0, s0, s1
+; GISEL-GFX12-NEXT:    s_mov_b32 s6, s3
+; GISEL-GFX12-NEXT:    s_mov_b32 s7, s4
+; GISEL-GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-GFX12-NEXT:    s_min_num_f16 s0, s0, s2
+; GISEL-GFX12-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-GFX12-NEXT:    global_store_b16 v1, v0, s[6:7]
+; GISEL-GFX12-NEXT:    s_endpgm
   %smax = call half @llvm.maxnum.f16(half %a, half %b)
   %sminmax = call half @llvm.minnum.f16(half %smax, half %c)
   store half %sminmax, ptr addrspace(1) %out
@@ -308,23 +526,49 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b
 }
 
 define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) {
-; SDAG-LABEL: test_minmax_commuted_f16_ieee_true:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; SDAG-NEXT:    v_max_f16_e32 v2, v2, v2
-; SDAG-NEXT:    v_maxmin_f16 v0, v0, v1, v2
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_minmax_commuted_f16_ieee_true:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GISEL-NEXT:    v_max_f16_e32 v2, v2, v2
-; GISEL-NEXT:    v_maxmin_f16 v0, v0, v1, v2
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
+; SDAG-GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
+; SDAG-GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
+; SDAG-GFX11-NEXT:    v_maxmin_f16 v0, v0, v1, v2
+; SDAG-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
+; GISEL-GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
+; GISEL-GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
+; GISEL-GFX11-NEXT:    v_maxmin_f16 v0, v0, v1, v2
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; SDAG-GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; SDAG-GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; SDAG-GFX12-NEXT:    v_maxmin_num_f16 v0, v0, v1, v2
+; SDAG-GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GISEL-GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GISEL-GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; GISEL-GFX12-NEXT:    v_maxmin_num_f16 v0, v0, v1, v2
+; GISEL-GFX12-NEXT:    s_setpc_b64 s[30:31]
   %max = call half @llvm.maxnum.f16(half %a, half %b)
   %minmax = call half @llvm.minnum.f16(half %c, half %max)
   ret half %minmax
@@ -335,29 +579,60 @@ define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half %c) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_minmax_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: test_maxmin_f16_ieee_false:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_minmax_num_f16 v0, v0, v1, v2
+; GFX12-NEXT:    ; return to shader part epilog
   %min = call half @llvm.minnum.f16(half %a, half %b)
   %maxmin = call half @llvm.maxnum.f16(half %min, half %c)
   ret half %maxmin
 }
 
 define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) {
-; SDAG-LABEL: test_maxmin_commuted_f16_ieee_true:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; SDAG-NEXT:    v_max_f16_e32 v2, v2, v2
-; SDAG-NEXT:    v_minmax_f16 v0, v0, v1, v2
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_maxmin_commuted_f16_ieee_true:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GISEL-NEXT:    v_max_f16_e32 v2, v2, v2
-; GISEL-NEXT:    v_minmax_f16 v0, v0, v1, v2
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX11-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
+; SDAG-GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
+; SDAG-GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
+; SDAG-GFX11-NEXT:    v_minmax_f16 v0, v0, v1, v2
+; SDAG-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
+; GISEL-GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
+; GISEL-GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
+; GISEL-GFX11-NEXT:    v_minmax_f16 v0, v0, v1, v2
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT:    s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; SDAG-GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; SDAG-GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; SDAG-GFX12-NEXT:    v_minmax_num_f16 v0, v0, v1, v2
+; SDAG-GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT:    s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GISEL-GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GISEL-GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; GISEL-GFX12-NEXT:    v_minmax_num_f16 v0, v0, v1, v2
+; GISEL-GFX12-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half %a, half %b)
   %maxmin = call half @llvm.maxnum.f16(half %c, half %min)
   ret half %maxmin
@@ -370,6 +645,17 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0
 ; GFX11-NEXT:    v_med3_f16 v2, v2, v3, v4
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_med3_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_med3_num_f16 v2, v2, v3, v4
+; GFX12-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call half @llvm.minnum.f16(half %x, half %y)
   %tmp1 = call half @llvm.maxnum.f16(half %x, half %y)
   %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %z)
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
new file mode 100644
index 000000000000..a9b8663a48de
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 %s -o - | FileCheck %s --check-prefixes=GFX942
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 %s -o - | FileCheck %s --check-prefixes=GFX908
+
+define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
+; GFX942-LABEL: matmul_kernel:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_mov_b32 s2, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v1
+; GFX942-NEXT:    s_mov_b32 s3, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
+; GFX942-NEXT:    s_branch .LBB0_2
+; GFX942-NEXT:  .LBB0_1: ; %bb2
+; GFX942-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; GFX942-NEXT:    s_or_b32 s4, s3, 1
+; GFX942-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX942-NEXT:    s_mov_b32 s3, s2
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX942-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT:    s_and_b32 s3, s5, s4
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[4:5], a[0:3]
+; GFX942-NEXT:    s_cbranch_execz .LBB0_4
+; GFX942-NEXT:  .LBB0_2: ; %bb
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_vccz .LBB0_1
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    ; implicit-def: $sgpr3
+; GFX942-NEXT:  .LBB0_4: ; %common.ret
+; GFX942-NEXT:    s_endpgm
+;
+; GFX908-LABEL: matmul_kernel:
+; GFX908:       ; %bb.0: ; %entry
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX908-NEXT:    v_mov_b32_e32 v1, 0
+; GFX908-NEXT:    s_mov_b32 s2, 0
+; GFX908-NEXT:    s_mov_b32 s3, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a0, v1
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX908-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX908-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX908-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
+; GFX908-NEXT:    s_branch .LBB0_2
+; GFX908-NEXT:  .LBB0_1: ; %bb2
+; GFX908-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; GFX908-NEXT:    s_or_b32 s4, s3, 1
+; GFX908-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX908-NEXT:    s_mov_b32 s3, s2
+; GFX908-NEXT:    s_nop 3
+; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT:    v_mov_b32_e32 v5, s3
+; GFX908-NEXT:    v_mov_b32_e32 v4, s2
+; GFX908-NEXT:    v_mov_b32_e32 v2, v1
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT:    s_and_b32 s3, s5, s4
+; GFX908-NEXT:    v_mfma_f32_16x16x16f16 a[0:3], v[4:5], v[4:5], a[0:3]
+; GFX908-NEXT:    s_cbranch_execz .LBB0_4
+; GFX908-NEXT:  .LBB0_2: ; %bb
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX908-NEXT:    s_cbranch_vccz .LBB0_1
+; GFX908-NEXT:  ; %bb.3:
+; GFX908-NEXT:    ; implicit-def: $sgpr3
+; GFX908-NEXT:  .LBB0_4: ; %common.ret
+; GFX908-NEXT:    s_endpgm
+entry:
+  br label %bb
+
+bb:
+  %i = phi { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } [ %i10, %bb2 ], [ zeroinitializer, %entry ]
+  %i1 = phi i32 [ %i5, %bb2 ], [ 0, %entry ]
+  %c0 = icmp ne i32 %a0, 0
+  br i1 %c0, label %bb2, label %bb11
+
+bb2:
+  %i3 = or i32 %i1, 1
+  %i4 = icmp slt i32 %i1, 0
+  %i5 = select i1 %i4, i32 %i3, i32 0
+  %i6 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %i, 123
+  %i7 = insertelement <4 x float> zeroinitializer, float %i6, i32 0
+  %i8 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %i7, i32 0, i32 0, i32 0)
+  %i9 = extractelement <4 x float> %i8, i32 0
+  %i10 = insertvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } zeroinitializer, float %i9, 123
+  br label %bb
+
+bb11:
+  %c1 = icmp ne i32 %a1, 0
+  br i1 %c1, label %bb12, label %common.ret
+
+common.ret:
+  ret void
+
+bb12:
+  %i13 = extractvalue { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float } %i, 0
+  %i14 = insertelement <4 x float> zeroinitializer, float %i13, i32 0
+  %i15 = insertelement <4 x float> %i14, float 0.000000e+00, i32 0
+  %i16 = insertelement <4 x float> %i15, float 0.000000e+00, i32 0
+  br label %common.ret
+}
+
+; Function Attrs: convergent nocallback nofree nosync nounwind willreturn memory(none)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
new file mode 100644
index 000000000000..5c83170563e5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
@@ -0,0 +1,235 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -run-pass si-fold-operands %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=COALESCE
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=GFX908-COALESCE
+
+...
+---
+name:            test
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; CHECK-NEXT:   S_BITCMP1_B32 killed [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+  ; CHECK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:agpr_32 = COPY [[V_MOV_B32_e32_]]
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 [[V_CNDMASK_B32_e64_]], 1, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:agpr_32 = PHI [[COPY1]], %bb.0, %24, %bb.3
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %11, %bb.3
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:agpr_32 = COPY [[PHI]]
+  ; CHECK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI1]], 1, implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[PHI1]], 31, implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_ASHR_I32_]], killed [[S_OR_B32_]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3
+  ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:areg_128_align2 = COPY [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY4]], [[COPY4]], killed [[COPY5]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF]], %bb.1, [[S_AND_B32_]], %bb.2
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:agpr_32 = PHI [[COPY3]], %bb.1, [[V_MFMA_F32_16X16X16F16_e64_]].sub0, %bb.2
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:sreg_64_xexec = PHI [[S_MOV_B64_]], %bb.1, [[S_MOV_B64_1]], %bb.2
+  ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[PHI4]], implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 [[V_CNDMASK_B32_e64_1]], 1, implicit $exec
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_1]], implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+  ; CHECK-NEXT:   S_BRANCH %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  ;
+  ; COALESCE-LABEL: name: test
+  ; COALESCE: bb.0:
+  ; COALESCE-NEXT:   successors: %bb.1(0x80000000)
+  ; COALESCE-NEXT:   liveins: $sgpr4_sgpr5
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; COALESCE-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; COALESCE-NEXT:   S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+  ; COALESCE-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0
+  ; COALESCE-NEXT:   [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc
+  ; COALESCE-NEXT:   undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_MOV_B32_e32 0, implicit $exec
+  ; COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; COALESCE-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+  ; COALESCE-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec
+  ; COALESCE-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.1:
+  ; COALESCE-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+  ; COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
+  ; COALESCE-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
+  ; COALESCE-NEXT:   S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc
+  ; COALESCE-NEXT:   S_BRANCH %bb.2
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.2:
+  ; COALESCE-NEXT:   successors: %bb.3(0x80000000)
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], 1, implicit-def dead $scc
+  ; COALESCE-NEXT:   [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc
+  ; COALESCE-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc
+  ; COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
+  ; COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
+  ; COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0
+  ; COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
+  ; COALESCE-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[V_MOV_B32_e32_]]
+  ; COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.3:
+  ; COALESCE-NEXT:   successors: %bb.4(0x40000000), %bb.1(0x40000000)
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_MOV_B64_]], implicit $exec
+  ; COALESCE-NEXT:   [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec
+  ; COALESCE-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
+  ; COALESCE-NEXT:   S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+  ; COALESCE-NEXT:   S_BRANCH %bb.4
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.4:
+  ; COALESCE-NEXT:   successors: %bb.5(0x80000000)
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.5:
+  ; COALESCE-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX908-COALESCE-LABEL: name: test
+  ; GFX908-COALESCE: bb.0:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-COALESCE-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; GFX908-COALESCE-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; GFX908-COALESCE-NEXT:   S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+  ; GFX908-COALESCE-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0
+  ; GFX908-COALESCE-NEXT:   [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc
+  ; GFX908-COALESCE-NEXT:   undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX908-COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.1:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
+  ; GFX908-COALESCE-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
+  ; GFX908-COALESCE-NEXT:   S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc
+  ; GFX908-COALESCE-NEXT:   S_BRANCH %bb.2
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.2:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], 1, implicit-def dead $scc
+  ; GFX908-COALESCE-NEXT:   [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc
+  ; GFX908-COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
+  ; GFX908-COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
+  ; GFX908-COALESCE-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[V_MOV_B32_e32_]]
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.3:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.4(0x40000000), %bb.1(0x40000000)
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_MOV_B64_]], implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_1]], implicit $exec
+  ; GFX908-COALESCE-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
+  ; GFX908-COALESCE-NEXT:   S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+  ; GFX908-COALESCE-NEXT:   S_BRANCH %bb.4
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.4:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.5(0x80000000)
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.5:
+  ; GFX908-COALESCE-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+    liveins: $sgpr4_sgpr5
+
+    %0:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0(p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+    S_BITCMP1_B32 killed %1, 0, implicit-def $scc
+    %2:sgpr_32 = S_MOV_B32 0
+    %3:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %5:sreg_32 = IMPLICIT_DEF
+    %6:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %3, implicit $exec
+    %7:sreg_64_xexec = V_CMP_NE_U32_e64 %6, 1, implicit $exec
+
+  bb.1:
+    successors: %bb.2, %bb.3
+
+    %8:vgpr_32 = PHI %4, %bb.0, %9, %bb.3
+    %10:sreg_32 = PHI %2, %bb.0, %11, %bb.3
+    %12:agpr_32 = COPY %8
+    %13:sreg_64 = S_MOV_B64 -1
+    $vcc = S_AND_B64 $exec, %7, implicit-def $scc
+    S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3
+
+    %14:sreg_32 = S_OR_B32 %10, 1, implicit-def dead $scc
+    %15:sreg_32 = S_ASHR_I32 %10, 31, implicit-def dead $scc
+    %16:sreg_32 = S_AND_B32 killed %15, killed %14, implicit-def dead $scc
+    %17:vreg_128_align2 = REG_SEQUENCE %8, %subreg.sub0, %4, %subreg.sub1, %4, %subreg.sub2, %4, %subreg.sub3
+    %18:sreg_64 = REG_SEQUENCE %2, %subreg.sub0, %2, %subreg.sub1
+    %19:vreg_64_align2 = COPY %18
+    %20:areg_128_align2 = COPY %17
+    %21:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %19, %19, killed %20, 0, 0, 0, implicit $mode, implicit $exec
+    %22:vgpr_32 = COPY %21.sub0
+    %23:sreg_64 = S_MOV_B64 0
+
+  bb.3:
+    successors: %bb.4, %bb.1
+
+    %11:sreg_32 = PHI %5, %bb.1, %16, %bb.2
+    %24:agpr_32 = PHI %12, %bb.1, %21.sub0, %bb.2
+    %25:sreg_64_xexec = PHI %13, %bb.1, %23, %bb.2
+    %9:vgpr_32 = COPY %24
+    %26:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %25, implicit $exec
+    %27:sreg_64_xexec = V_CMP_NE_U32_e64 %26, 1, implicit $exec
+    $vcc = S_AND_B64 $exec, %27, implicit-def $scc
+    S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+    S_BRANCH %bb.4
+
+  bb.4:
+    successors: %bb.5
+
+  bb.5:
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir
new file mode 100644
index 000000000000..49c0aaf9fb39
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir
@@ -0,0 +1,182 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -run-pass si-fold-operands %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=COALESCE
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -start-before=si-fold-operands -stop-after=register-coalescer %s -o - | FileCheck %s --check-prefixes=GFX908-COALESCE
+
+...
+---
+name:            test
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; CHECK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   S_BITCMP0_B32 killed [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_SCC0 %bb.2, implicit $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_3:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_4:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:areg_128_align2 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_3]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_4]], %subreg.sub3
+  ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], killed [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], killed [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_3:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], killed [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MFMA_F32_16X16X16F16_e64_3]].sub0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:agpr_32 = PHI [[V_ACCVGPR_WRITE_B32_e64_]], %bb.1, [[V_MFMA_F32_16X16X16F16_e64_3]].sub0, %bb.2
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[PHI]]
+  ; CHECK-NEXT:   [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_PACK_B32_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed [[V_CVT_F16_F32_e64_]], 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_PACK_B32_F16_e64_]], %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
+  ; CHECK-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+  ; CHECK-NEXT:   BUFFER_STORE_DWORDX2_OFFSET_exact [[REG_SEQUENCE2]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8)
+  ; CHECK-NEXT:   S_ENDPGM 0
+  ;
+  ; COALESCE-LABEL: name: test
+  ; COALESCE: bb.0:
+  ; COALESCE-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; COALESCE-NEXT:   liveins: $sgpr4_sgpr5
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; COALESCE-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; COALESCE-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0
+  ; COALESCE-NEXT:   S_BITCMP0_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+  ; COALESCE-NEXT:   S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.1:
+  ; COALESCE-NEXT:   successors: %bb.3(0x80000000)
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; COALESCE-NEXT:   S_BRANCH %bb.3
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.2:
+  ; COALESCE-NEXT:   successors: %bb.3(0x80000000)
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1
+  ; COALESCE-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT: {{  $}}
+  ; COALESCE-NEXT: bb.3:
+  ; COALESCE-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+  ; COALESCE-NEXT:   [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY2]], implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   undef [[V_PACK_B32_F16_e64_:%[0-9]+]].sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, [[V_CVT_F16_F32_e32_]], 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec
+  ; COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; COALESCE-NEXT:   BUFFER_STORE_DWORDX2_OFFSET_exact [[V_PACK_B32_F16_e64_]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8)
+  ; COALESCE-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX908-COALESCE-LABEL: name: test
+  ; GFX908-COALESCE: bb.0:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; GFX908-COALESCE-NEXT:   liveins: $sgpr4_sgpr5
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; GFX908-COALESCE-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; GFX908-COALESCE-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0
+  ; GFX908-COALESCE-NEXT:   S_BITCMP0_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
+  ; GFX908-COALESCE-NEXT:   S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.1:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; GFX908-COALESCE-NEXT:   S_BRANCH %bb.3
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.2:
+  ; GFX908-COALESCE-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1
+  ; GFX908-COALESCE-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT: {{  $}}
+  ; GFX908-COALESCE-NEXT: bb.3:
+  ; GFX908-COALESCE-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY2]], implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   undef [[V_PACK_B32_F16_e64_:%[0-9]+]].sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, [[V_CVT_F16_F32_e32_]], 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; GFX908-COALESCE-NEXT:   BUFFER_STORE_DWORDX2_OFFSET_exact [[V_PACK_B32_F16_e64_]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8)
+  ; GFX908-COALESCE-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.2, %bb.1
+    liveins: $sgpr4_sgpr5
+
+    %0:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0(p4), 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+    %2:sgpr_32 = S_MOV_B32 0
+    S_BITCMP0_B32 killed %1, 0, implicit-def $scc
+    S_CBRANCH_SCC0 %bb.2, implicit $scc
+
+  bb.1:
+    successors: %bb.3
+
+    %3:sgpr_32 = COPY %2
+    %4:vgpr_32 = COPY %3, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.2:
+    successors: %bb.3
+
+    %5:sgpr_32 = S_MOV_B32 0
+    %6:vgpr_32 = COPY %5
+    %7:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec
+    %8:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec
+    %9:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec
+    %10:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %6, implicit $exec
+    %11:areg_128_align2 = REG_SEQUENCE %7, %subreg.sub0, %8, %subreg.sub1, %9, %subreg.sub2, %10, %subreg.sub3
+    %12:sreg_64 = REG_SEQUENCE %5, %subreg.sub0, %5, %subreg.sub1
+    %13:vreg_64_align2 = COPY %12
+    %14:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %11, 0, 0, 0, implicit $mode, implicit $exec
+    %15:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %14, 0, 0, 0, implicit $mode, implicit $exec
+    %16:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %15, 0, 0, 0, implicit $mode, implicit $exec
+    %17:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %13, %13, killed %16, 0, 0, 0, implicit $mode, implicit $exec
+    %18:vgpr_32 = COPY %17.sub0
+    %19:vgpr_32 = COPY %18
+
+  bb.3:
+    %20:vgpr_32 = PHI %4, %bb.1, %19, %bb.2
+    %21:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, %20, 0, 0, implicit $mode, implicit $exec
+    %22:vgpr_32 = nofpexcept V_PACK_B32_F16_e64 0, killed %21, 0, %2, 0, 0, implicit $mode, implicit $exec
+    %23:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %24:vreg_64_align2 = REG_SEQUENCE %22, %subreg.sub0, killed %23, %subreg.sub1
+    %25:sgpr_128 = REG_SEQUENCE %2, %subreg.sub0, %2, %subreg.sub1, %2, %subreg.sub2, %2, %subreg.sub3
+    %26:vreg_64_align2 = COPY %24
+    BUFFER_STORE_DWORDX2_OFFSET_exact killed %26, killed %25, %2, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) null`, align 1, addrspace 8)
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 98d5f3097153..a2a0107a6f7d 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -1372,20 +1372,19 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0xf000
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v3
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v4, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0xf800
-; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[3:4]
-; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
+; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[3:4]
+; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[7:8]
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v3
 ; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10]
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 1, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v4
 ; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v7
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v6, v8, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v5
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v8, v6, vcc
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v9, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc
@@ -1416,32 +1415,32 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 7, v0
-; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff8000, v1
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff8000, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v12
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v10
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, 0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 1, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[4:5], off offset:-4096
 ; GFX9-NEXT:    s_movk_i32 s0, 0xf000
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[4:5], off
-; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off offset:2048
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:2048
+; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v7, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v9, v3, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v10, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v11, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v9, v1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v12, v[0:1], s[34:35]
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_store_dwordx2 v10, v[0:1], s[34:35]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: Offset64:
@@ -1477,8 +1476,7 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
 ; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:-2048
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 1, v1, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
 ; GFX10-NEXT:    global_load_dwordx2 v[10:11], v[0:1], off
@@ -1517,25 +1515,25 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 1, v1, vcc_lo
-; GFX11-NEXT:    global_load_b64 v[2:3], v[0:1], off
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0xfffff000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    global_load_b64 v[6:7], v[4:5], off offset:-4096
-; GFX11-NEXT:    global_load_b64 v[4:5], v[4:5], off
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:2048
+; GFX11-NEXT:    global_load_b64 v[4:5], v[2:3], off
+; GFX11-NEXT:    global_load_b64 v[6:7], v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:2048
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 1, v1
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v2
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v7, v3, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v8, v[0:1], s[34:35]
 ; GFX11-NEXT:    s_endpgm
 entry:
@@ -2408,18 +2406,17 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v1, v0
+; GFX8-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GFX8-NEXT:    s_movk_i32 s0, 0x800
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v6, vcc
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, -1, v6, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v3
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, -1, v0, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v0
 ; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
+; GFX8-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v6, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
 ; GFX8-NEXT:    flat_store_dwordx2 v[1:2], v[3:4]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -2450,14 +2447,13 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0x1000, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:-2048
+; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v5, vcc
@@ -2490,15 +2486,14 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_add_co_u32 v1, s0, s34, v8
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, s35, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v4, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x800, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, -1, v1
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v4
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo
@@ -2525,19 +2520,18 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
 ; GFX11-NEXT:    v_add_co_u32 v1, s0, s34, v4
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, s35, 0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, v1, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v3
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v5, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0, v3
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v5, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, 0x1000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, -1, v1
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off offset:-2048
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:-2048
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[34:35]
 ; GFX11-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/remat-physreg-copy-subreg-extract-already-live-at-def-issue120970.mir b/llvm/test/CodeGen/AMDGPU/remat-physreg-copy-subreg-extract-already-live-at-def-issue120970.mir
new file mode 100644
index 000000000000..3879f6dccf9d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remat-physreg-copy-subreg-extract-already-live-at-def-issue120970.mir
@@ -0,0 +1,85 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=register-coalescer -o - %s | FileCheck %s
+
+# This used to assert due to trying to rematerialize V_MOV_B64_PSEUDO
+# at copy to $vgpr1. This would assert since this would clobber the
+# live value in $vgpr0.
+
+---
+name: rematerialize_physreg_sub_def_already_live_at_def_assert
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: rematerialize_physreg_sub_def_already_live_at_def_assert
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: $vgpr1 = COPY [[V_MOV_B]].sub1
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit killed $vgpr1
+    %0:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    %1:vgpr_32 = COPY %0.sub1
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1 = COPY %1
+    SI_RETURN implicit $vgpr0, implicit killed $vgpr1
+...
+
+# Same as previous, except with an IMPLICIT_DEF
+---
+name: rematerialize_physreg_sub_def_already_live_at_def_assert_implicit_def
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: rematerialize_physreg_sub_def_already_live_at_def_assert_implicit_def
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+    ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: $vgpr1 = COPY [[DEF]].sub1
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit killed $vgpr1
+    %0:vreg_64 = IMPLICIT_DEF
+    %1:vgpr_32 = COPY %0.sub1
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1 = COPY %1
+    SI_RETURN implicit $vgpr0, implicit killed $vgpr1
+...
+
+---
+name: rematerialize_physreg_sub_def_no_live_sub_def_0
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: rematerialize_physreg_sub_def_no_live_sub_def_0
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: dead $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 1, implicit $exec, implicit-def $vgpr1
+    ; CHECK-NEXT: SI_RETURN implicit killed $vgpr1
+    %0:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    %1:vgpr_32 = COPY %0.sub1
+    $vgpr1 = COPY %1
+    SI_RETURN implicit killed $vgpr1
+...
+
+---
+name: rematerialize_physreg_sub_def_no_live_sub_def_1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: rematerialize_physreg_sub_def_no_live_sub_def_1
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: dead $vgpr1_vgpr2 = V_MOV_B64_PSEUDO 1, implicit $exec, implicit-def $vgpr1
+    ; CHECK-NEXT: SI_RETURN implicit killed $vgpr1
+    %0:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+    %1:vgpr_32 = COPY %0.sub0
+    $vgpr1 = COPY %1
+    SI_RETURN implicit killed $vgpr1
+...
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 3e8768c98b5c..96dd6276f7e3 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -1065,100 +1065,37 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_sdiv24_48:
-; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GCN-IR-NEXT:    s_mov_b32 s15, 0
-; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_sext_i32_i16 s1, s1
-; GCN-IR-NEXT:    s_ashr_i64 s[0:1], s[0:1], 24
-; GCN-IR-NEXT:    s_sext_i32_i16 s3, s3
-; GCN-IR-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
-; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 24
-; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[0:1], 16
-; GCN-IR-NEXT:    s_ashr_i32 s0, s1, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
-; GCN-IR-NEXT:    s_mov_b32 s1, s0
-; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[2:3], 16
-; GCN-IR-NEXT:    s_ashr_i32 s2, s3, 31
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
-; GCN-IR-NEXT:    s_mov_b32 s3, s2
-; GCN-IR-NEXT:    s_sub_u32 s12, s6, s0
-; GCN-IR-NEXT:    s_subb_u32 s13, s7, s0
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[8:9], s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s6, s6, s2
-; GCN-IR-NEXT:    s_subb_u32 s7, s7, s2
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[12:13], 0
-; GCN-IR-NEXT:    s_flbit_i32_b64 s14, s[6:7]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_flbit_i32_b64 s20, s[12:13]
-; GCN-IR-NEXT:    s_sub_u32 s16, s14, s20
-; GCN-IR-NEXT:    s_subb_u32 s17, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[16:17], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[22:23], s[16:17], 63
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[10:11], s[18:19]
-; GCN-IR-NEXT:    s_and_b64 s[10:11], s[18:19], exec
-; GCN-IR-NEXT:    s_cselect_b32 s11, 0, s13
-; GCN-IR-NEXT:    s_cselect_b32 s10, 0, s12
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s18, s16, 1
-; GCN-IR-NEXT:    s_addc_u32 s19, s17, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[18:19], 0
-; GCN-IR-NEXT:    s_sub_i32 s16, 63, s16
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[12:13], s16
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[12:13], s18
-; GCN-IR-NEXT:    s_add_u32 s18, s6, -1
-; GCN-IR-NEXT:    s_addc_u32 s19, s7, -1
-; GCN-IR-NEXT:    s_not_b64 s[8:9], s[14:15]
-; GCN-IR-NEXT:    s_add_u32 s12, s8, s20
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
-; GCN-IR-NEXT:    s_mov_b32 s9, 0
-; GCN-IR-NEXT:  .LBB9_3: ; %udiv-do-while
-; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[16:17], 1
-; GCN-IR-NEXT:    s_lshr_b32 s8, s11, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[8:9]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[14:15], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s8, s18, s16
-; GCN-IR-NEXT:    s_subb_u32 s8, s19, s17
-; GCN-IR-NEXT:    s_ashr_i32 s14, s8, 31
-; GCN-IR-NEXT:    s_mov_b32 s15, s14
-; GCN-IR-NEXT:    s_and_b32 s8, s14, 1
-; GCN-IR-NEXT:    s_and_b64 s[14:15], s[14:15], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s16, s16, s14
-; GCN-IR-NEXT:    s_subb_u32 s17, s17, s15
-; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[8:9]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_3
-; GCN-IR-NEXT:  .LBB9_4: ; %Flow4
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[6:7]
-; GCN-IR-NEXT:  .LBB9_5: ; %udiv-end
-; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
-; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
-; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[10:11], s[0:1]
-; GCN-IR-NEXT:    s_sub_u32 s0, s2, s0
-; GCN-IR-NEXT:    s_subb_u32 s1, s3, s1
+; GCN-IR:       ; %bb.0:
+; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-IR-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
-; GCN-IR-NEXT:    s_waitcnt expcnt(0)
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-IR-NEXT:    s_mov_b32 s5, s1
+; GCN-IR-NEXT:    s_sext_i32_i16 s1, s9
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-IR-NEXT:    v_alignbit_b32 v0, s1, v0, 24
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, v0
+; GCN-IR-NEXT:    s_mov_b32 s4, s0
+; GCN-IR-NEXT:    s_sext_i32_i16 s0, s3
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-IR-NEXT:    v_alignbit_b32 v2, s0, v2, 24
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v3, v2
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v1
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, v2, v0
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v0, 1, v0
+; GCN-IR-NEXT:    v_mul_f32_e32 v2, v3, v4
+; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GCN-IR-NEXT:    v_mad_f32 v3, -v2, v1, v3
+; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-IR-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
 ; GCN-IR-NEXT:    s_endpgm
   %1 = ashr i48 %x, 24
   %2 = ashr i48 %y, 24
diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll
index e0d0ddce208c..ddf6297bc27a 100644
--- a/llvm/test/CodeGen/AMDGPU/smed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/smed3.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
@@ -98,6 +100,8 @@ declare i64 @llvm.smin.i64(i64, i64)
 ; VI: v_max_i16_e32 [[MAX:v[0-9]]], 12, {{v[0-9]}}
 ; VI: v_min_i16_e32 {{v[0-9]}}, 17, [[MAX]]
 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; GFX11-TRUE16: v_med3_i16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, 12, 17
+; GFX11-FAKE16: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
 define amdgpu_kernel void @v_test_smed3_r_i_i_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
@@ -686,6 +690,8 @@ bb:
 ; VI: v_max_i16
 
 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX11-TRUE16: v_med3_i16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l
+; GFX11-FAKE16: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @v_test_smed3_i16_pat_0(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -707,6 +713,8 @@ bb:
 
 ; GCN-LABEL: {{^}}v_test_smed3_i16_pat_1:
 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX11-TRUE16: v_med3_i16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l
+; GFX11-FAKE16: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 define amdgpu_kernel void @v_test_smed3_i16_pat_1(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index cb8f82db92bb..23364e860d15 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1188,109 +1188,39 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_srem24_48:
-; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
-; GCN-IR-NEXT:    s_mov_b32 s13, 0
+; GCN-IR:       ; %bb.0:
+; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_sext_i32_i16 s1, s1
 ; GCN-IR-NEXT:    s_sext_i32_i16 s3, s3
-; GCN-IR-NEXT:    s_ashr_i64 s[0:1], s[0:1], 24
-; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 24
-; GCN-IR-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[2:3], 16
-; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[0:1], 16
-; GCN-IR-NEXT:    s_ashr_i32 s0, s1, 31
-; GCN-IR-NEXT:    s_mov_b32 s1, s0
-; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[6:7], 16
-; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
-; GCN-IR-NEXT:    s_sub_u32 s2, s2, s0
-; GCN-IR-NEXT:    s_subb_u32 s3, s3, s0
-; GCN-IR-NEXT:    s_ashr_i32 s10, s7, 31
-; GCN-IR-NEXT:    s_mov_b32 s11, s10
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s6, s6, s10
-; GCN-IR-NEXT:    s_subb_u32 s7, s7, s10
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
-; GCN-IR-NEXT:    s_flbit_i32_b64 s12, s[6:7]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_flbit_i32_b64 s20, s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s14, s12, s20
-; GCN-IR-NEXT:    s_subb_u32 s15, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[14:15], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[14:15], 63
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[10:11], s[16:17]
-; GCN-IR-NEXT:    s_and_b64 s[10:11], s[16:17], exec
-; GCN-IR-NEXT:    s_cselect_b32 s11, 0, s3
-; GCN-IR-NEXT:    s_cselect_b32 s10, 0, s2
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s16, s14, 1
-; GCN-IR-NEXT:    s_addc_u32 s17, s15, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[16:17], 0
-; GCN-IR-NEXT:    s_sub_i32 s14, 63, s14
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[2:3], s14
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_4
-; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[2:3], s16
-; GCN-IR-NEXT:    s_add_u32 s18, s6, -1
-; GCN-IR-NEXT:    s_addc_u32 s19, s7, -1
-; GCN-IR-NEXT:    s_not_b64 s[8:9], s[12:13]
-; GCN-IR-NEXT:    s_add_u32 s12, s8, s20
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], 0
-; GCN-IR-NEXT:    s_mov_b32 s9, 0
-; GCN-IR-NEXT:  .LBB9_3: ; %udiv-do-while
-; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_lshr_b32 s8, s11, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[8:9]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s8, s18, s14
-; GCN-IR-NEXT:    s_subb_u32 s8, s19, s15
-; GCN-IR-NEXT:    s_ashr_i32 s16, s8, 31
-; GCN-IR-NEXT:    s_mov_b32 s17, s16
-; GCN-IR-NEXT:    s_and_b32 s8, s16, 1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[16:17], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT:    s_subb_u32 s15, s15, s17
-; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], s[8:9]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_3
-; GCN-IR-NEXT:  .LBB9_4: ; %Flow4
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
-; GCN-IR-NEXT:  .LBB9_5: ; %udiv-end
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GCN-IR-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x9
-; GCN-IR-NEXT:    s_mul_i32 s4, s6, s11
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
-; GCN-IR-NEXT:    s_mul_i32 s4, s7, s10
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
-; GCN-IR-NEXT:    s_mul_i32 s4, s6, s10
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-IR-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
-; GCN-IR-NEXT:    v_subb_u32_e32 v0, vcc, v2, v0, vcc
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, s1, v0
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-IR-NEXT:    v_subrev_i32_e32 v1, vcc, s0, v1
-; GCN-IR-NEXT:    s_mov_b32 s15, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s14, -1
-; GCN-IR-NEXT:    v_subb_u32_e32 v0, vcc, v0, v2, vcc
-; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    buffer_store_short v0, off, s[12:15], 0 offset:4
-; GCN-IR-NEXT:    buffer_store_dword v1, off, s[12:15], 0
+; GCN-IR-NEXT:    s_sext_i32_i16 s5, s5
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-IR-NEXT:    v_alignbit_b32 v0, s5, v0, 24
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-IR-NEXT:    v_alignbit_b32 v2, s3, v2, 24
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v3, v2
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v1
+; GCN-IR-NEXT:    v_xor_b32_e32 v5, v2, v0
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v5, 30, v5
+; GCN-IR-NEXT:    v_or_b32_e32 v5, 1, v5
+; GCN-IR-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GCN-IR-NEXT:    v_trunc_f32_e32 v4, v4
+; GCN-IR-NEXT:    v_mad_f32 v3, -v4, v1, v3
+; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
+; GCN-IR-NEXT:    s_mov_b32 s4, s0
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GCN-IR-NEXT:    s_mov_b32 s5, s1
+; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-IR-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
 ; GCN-IR-NEXT:    s_endpgm
   %1 = ashr i48 %x, 24
   %2 = ashr i48 %y, 24
diff --git a/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll b/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll
new file mode 100644
index 000000000000..f52f1164f2ba
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+; Reduce a 64-bit sub by a constant if we know the low 32-bits are all
+; zero.
+
+; sub i64:x, K if computeTrailingZeros(K) >= 32
+; => build_pair (sub x.hi, K.hi), x.lo
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_0(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 0xfffc0000
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %reg, 1125899906842624 ; (1 << 50)
+  ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_1(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, -1
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %reg, 4294967296 ; (1 << 32)
+  ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_2(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, -2
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %reg, 8589934592 ; (1 << 33)
+  ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_3(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 0x80000000
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %reg, -9223372036854775808 ; (1 << 63)
+  ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_4(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_low_bits_known0_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, 1
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %reg, -4294967296 ; 0xffffffff00000000
+  ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_0(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 0xfffc0000, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %reg, 1125899906842624 ; (1 << 50)
+  ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_1(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %reg, 4294967296 ; (1 << 32)
+  ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_2(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, -2, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %reg, 8589934592 ; (1 << 33)
+  ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_3(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %reg, -9223372036854775808 ; (1 << 63)
+  ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_low_bits_known0_4(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_low_bits_known0_4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %reg, -4294967296 ; 0xffffffff00000000
+  ret i64 %sub
+}
+
+define amdgpu_ps i64 @s_sub_i64_const_high_bits_known0_0(i64 inreg %reg) {
+; GFX9-LABEL: s_sub_i64_const_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_u32 s0, s0, 1
+; GFX9-NEXT:    s_addc_u32 s1, s1, -1
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %reg, 4294967295 ; (1 << 31)
+  ret i64 %sub
+}
+
+define i64 @v_sub_i64_const_high_bits_known0_0(i64 %reg) {
+; GFX9-LABEL: v_sub_i64_const_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %reg, 4294967295 ; (1 << 31)
+  ret i64 %sub
+}
+
+define <2 x i64> @v_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64> %reg) {
+; GFX9-LABEL: v_sub_v2i64_splat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT:    v_add_u32_e32 v3, -1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
+  ret <2 x i64> %sub
+}
+
+define <2 x i64> @v_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) {
+; GFX9-LABEL: v_sub_v2i64_nonsplat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
+; GFX9-NEXT:    v_add_u32_e32 v3, -2, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
+  ret <2 x i64> %sub
+}
+
+define amdgpu_ps <2 x i64> @s_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
+; GFX9-LABEL: s_sub_v2i64_splat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, -1
+; GFX9-NEXT:    s_add_i32 s3, s3, -1
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
+  ret <2 x i64> %sub
+}
+
+define amdgpu_ps <2 x i64> @s_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
+; GFX9-LABEL: s_sub_v2i64_nonsplat_const_low_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_add_i32 s1, s1, -1
+; GFX9-NEXT:    s_add_i32 s3, s3, -2
+; GFX9-NEXT:    ; return to shader part epilog
+  %sub = sub <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
+  ret <2 x i64> %sub
+}
+
+; We could reduce this to use a 32-bit sub if we use computeKnownBits
+define i64 @v_sub_i64_variable_high_bits_known0_0(i64 %reg, i32 %offset.hi32) {
+; GFX9-LABEL: v_sub_i64_variable_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, 0, v0
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %zext.offset.hi32 = zext i32 %offset.hi32 to i64
+  %in.high.bits = shl i64 %zext.offset.hi32, 32
+  %sub = sub i64 %reg, %in.high.bits
+  ret i64 %sub
+}
+
+; We could reduce this to use a 32-bit sub if we use computeKnownBits
+define amdgpu_ps i64 @s_sub_i64_variable_high_bits_known0_0(i64 inreg %reg, i32 inreg %offset.hi32) {
+; GFX9-LABEL: s_sub_i64_variable_high_bits_known0_0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_sub_u32 s0, s0, 0
+; GFX9-NEXT:    s_subb_u32 s1, s1, s2
+; GFX9-NEXT:    ; return to shader part epilog
+  %zext.offset.hi32 = zext i32 %offset.hi32 to i64
+  %in.high.bits = shl i64 %zext.offset.hi32, 32
+  %sub = sub i64 %reg, %in.high.bits
+  ret i64 %sub
+}
diff --git a/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir
index 3135e3c60f5f..831570800d06 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir
+++ b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -stress-regalloc=4 -start-before=greedy,2 -stop-after=virtregrewriter,2  -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -stress-regalloc=4 -verify-regalloc -start-before=greedy,2 -stop-after=virtregrewriter,2 -o - %s | FileCheck %s
 
 # This testcase hit a situation where greedy would hit a use after
 # free during last chance recoloring. This case successfully allocates
@@ -26,11 +26,12 @@ body:             |
   ; CHECK-NEXT:   renamable $vgpr5 = V_FMA_F32_e64 0, $vgpr7, 0, $vgpr7, 0, $vgpr2, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   SI_SPILL_AV128_SAVE $vgpr6_vgpr7_vgpr8_vgpr9, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $vgpr6 = V_FMA_F32_e64 0, killed $vgpr8, 0, $vgpr8, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   renamable $vgpr7 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
-  ; CHECK-NEXT:   liveins: $vgpr0_vgpr1_vgpr2_vgpr3:0x00000000000000FF, $vgpr4_vgpr5_vgpr6_vgpr7:0x000000000000003F
+  ; CHECK-NEXT:   liveins: $vgpr0_vgpr1_vgpr2_vgpr3:0x00000000000000FF, $vgpr4_vgpr5_vgpr6_vgpr7:0x00000000000000FF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   SI_SPILL_AV128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
   ; CHECK-NEXT:   renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
@@ -40,7 +41,7 @@ body:             |
   ; CHECK-NEXT:   SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
   ; CHECK-NEXT:   renamable $vgpr0 = IMPLICIT_DEF
   ; CHECK-NEXT:   renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
-  ; CHECK-NEXT:   renamable $vgpr5 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, undef $vgpr7, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   renamable $vgpr5 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, killed $vgpr7, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
   ; CHECK-NEXT:   renamable $vgpr9 = COPY killed renamable $vgpr5
   ; CHECK-NEXT:   renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
@@ -73,6 +74,7 @@ body:             |
     undef %4.sub0:vreg_128_align2 = V_FMA_F32_e64 0, %3.sub0, 0, %3.sub0, 0, %0.sub3, 0, 0, implicit $mode, implicit $exec
     %4.sub1:vreg_128_align2 = V_FMA_F32_e64 0, %3.sub1, 0, %3.sub1, 0, %0.sub2, 0, 0, implicit $mode, implicit $exec
     %4.sub2:vreg_128_align2 = V_FMA_F32_e64 0, %3.sub2, 0, %3.sub2, 0, %0.sub1, 0, 0, implicit $mode, implicit $exec
+    %4.sub3:vreg_128_align2 = IMPLICIT_DEF
     S_CBRANCH_EXECZ %bb.2, implicit $exec
 
   bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll b/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll
new file mode 100644
index 000000000000..f0b3d334af67
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/swdev503538-move-to-valu-stack-srd-physreg.ll
@@ -0,0 +1,23 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs=0 -O0 2> %t.err < %s | FileCheck %s
+; RUN: FileCheck -check-prefix=ERR %s < %t.err
+
+; FIXME: This error will be fixed by supporting arbitrary divergent
+; dynamic allocas by performing a wave umax of the size.
+
+; ERR: error: <unknown>:0:0: in function move_to_valu_assert_srd_is_physreg_swdev503538 i32 (ptr addrspace(1)): illegal VGPR to SGPR copy
+
+; CHECK: ; illegal copy v0 to s32
+
+define i32 @move_to_valu_assert_srd_is_physreg_swdev503538(ptr addrspace(1) %ptr) {
+entry:
+  %idx = load i32, ptr addrspace(1) %ptr, align 4
+  %zero = extractelement <4 x i32> zeroinitializer, i32 %idx
+  %alloca = alloca [2048 x i8], i32 %zero, align 8, addrspace(5)
+  %ld = load i32, ptr addrspace(5) %alloca, align 8
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 2048, i1 false)
+  ret i32 %ld
+}
+
+declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg) #0
+
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: write) }
diff --git a/llvm/test/CodeGen/AMDGPU/umed3.ll b/llvm/test/CodeGen/AMDGPU/umed3.ll
index 557d023c45f9..4726e81ceb8c 100644
--- a/llvm/test/CodeGen/AMDGPU/umed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/umed3.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN  -check-prefix=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
@@ -84,6 +86,8 @@ define amdgpu_kernel void @v_test_umed3_r_i_i_i64(ptr addrspace(1) %out, ptr add
 ; VI: v_max_u16_e32 [[MAX:v[0-9]]], 12, {{v[0-9]}}
 ; VI: v_min_u16_e32 {{v[0-9]}}, 17, [[MAX]]
 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; GFX11-TRUE16: v_med3_u16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, 12, 17
+; GFX11-FAKE16: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
 define amdgpu_kernel void @v_test_umed3_r_i_i_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
@@ -707,6 +711,8 @@ bb:
 ; VI: v_max_u16
 
 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX11-TRUE16: v_med3_u16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l
+; GFX11-FAKE16: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @v_test_umed3_i16_pat_0(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -728,6 +734,8 @@ bb:
 
 ; GCN-LABEL: {{^}}v_test_umed3_i16_pat_1:
 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX11-TRUE16: v_med3_u16 v{{[0-9]+}}.l, v{{[0-9]+}}.l, v{{[0-9]+}}.h, v{{[0-9]+}}.l
+; GFX11-FAKE16: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @v_test_umed3_i16_pat_1(ptr addrspace(1) %arg, ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #1 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/ARM/sink-store-pre-load-dependency.mir b/llvm/test/CodeGen/ARM/sink-store-pre-load-dependency.mir
new file mode 100644
index 000000000000..92c983e2bfd1
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/sink-store-pre-load-dependency.mir
@@ -0,0 +1,41 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -o - %s -mtriple=armv7-- -run-pass=machine-sink | FileCheck %s
+
+name: sink-store-load-dep
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, size: 8, alignment: 8 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sink-store-load-dep
+    ; CHECK:       bb.0:
+    ; CHECK:         [[LDRi12_:%[0-9]+]]:gpr = LDRi12 %stack.0, 0, 14 /* CC::al */, $noreg :: (load (s32))
+    ; CHECK-NEXT:    [[MOVi:%[0-9]+]]:gpr = MOVi 55296, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK-NEXT:    [[ADDri1:%[0-9]+]]:gpr = ADDri [[LDRi12_:%[0-9]+]], 0, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK-NEXT:    [[LDRH:%[0-9]+]]:gpr = LDRH killed [[ADDri1:%[0-9]+]], $noreg, 0, 14 /* CC::al */, $noreg :: (load (s16))
+    ; CHECK-NEXT:    [[MOVi1:%[0-9]+]]:gpr = MOVi 0, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK-NEXT:    early-clobber %5:gpr = STRH_PRE [[MOVi:%[0-9]+]], [[LDRi12_:%[0-9]+]], [[MOVi1:%[0-9]+]], 0, 14 /* CC::al */, $noreg
+    ; CHECK-NEXT:    [[SUBri:%.*]]:gpr = SUBri killed [[LDRi12_:%[0-9]+]], 0, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK:       bb.2:
+    ; CHECK-NEXT:    [[MOVi2:%[0-9]+]]:gpr = MOVi [[LDRH:%[0-9]+]], 14 /* CC::al */, $noreg, $noreg
+    %0:gpr = LDRi12 %stack.0, 0, 14, $noreg :: (load (s32))
+    %1:gpr = MOVi 55296, 14, $noreg, $noreg
+    %2:gpr = ADDri %0:gpr, 0, 14, $noreg, $noreg
+    %3:gpr = LDRH killed %2:gpr, $noreg, 0, 14, $noreg :: (load (s16))
+    %4:gpr = MOVi 0, 14, $noreg, $noreg
+    early-clobber %5:gpr = STRH_PRE %1:gpr, %0:gpr, %4:gpr, 0, 14, $noreg
+    %6:gpr = SUBri killed %0:gpr, 0, 14, $noreg, $noreg
+    CMPri %6:gpr, 0, 14, $noreg, implicit-def $cpsr
+    Bcc %bb.2, 3, $cpsr
+    B %bb.1
+
+  bb.1:
+    %8:gpr = MOVi 0, 14, $noreg, $noreg
+    $r0 = COPY %8:gpr
+    BX_RET 14, $noreg, implicit $r0
+
+  bb.2:
+    %9:gpr = MOVi %3:gpr, 14, $noreg, $noreg
+    $r0 = COPY %9:gpr
+    BX_RET 14, $noreg, implicit $r0
+...
diff --git a/llvm/test/CodeGen/DirectX/BufferLoad-sm61.ll b/llvm/test/CodeGen/DirectX/BufferLoad-sm61.ll
new file mode 100644
index 000000000000..501f15192d27
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/BufferLoad-sm61.ll
@@ -0,0 +1,60 @@
+; RUN: opt -S -dxil-op-lower %s | FileCheck %s
+; Before SM6.2 ByteAddressBuffer and StructuredBuffer lower to bufferLoad.
+
+target triple = "dxil-pc-shadermodel6.1-compute"
+
+; CHECK-LABEL: define void @loadf32_struct
+define void @loadf32_struct(i32 %index) {
+  %buffer = call target("dx.RawBuffer", float, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle %{{.*}}, i32 %index, i32 0)
+  %load = call {float, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_f32_0_0_0t(
+          target("dx.RawBuffer", float, 0, 0, 0) %buffer,
+          i32 %index,
+          i32 0)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadv4f32_byte
+define void @loadv4f32_byte(i32 %offset) {
+  %buffer = call target("dx.RawBuffer", i8, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle %{{.*}}, i32 %offset, i32 0)
+  %load = call {<4 x float>, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_i8_0_0_0t(
+          target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+          i32 %offset,
+          i32 0)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadnested
+define void @loadnested(i32 %index) {
+  %buffer = call
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATAI32:%.*]] = call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32(i32 68, %dx.types.Handle %{{.*}}, i32 %index, i32 0)
+  %loadi32 = call {i32, i1} @llvm.dx.resource.load.rawbuffer.i32(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 0)
+
+  ; CHECK: [[DATAF32:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle %{{.*}}, i32 %index, i32 4)
+  %loadf32 = call {<4 x float>, i1} @llvm.dx.resource.load.rawbuffer.v4f32(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 4)
+
+  ; CHECK: [[DATAF16:%.*]] = call %dx.types.ResRet.f16 @dx.op.bufferLoad.f16(i32 68, %dx.types.Handle %{{.*}}, i32 %index, i32 20)
+  %loadf16 = call {<3 x half>, i1} @llvm.dx.resource.load.rawbuffer.v3f16(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 20)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/BufferLoad.ll b/llvm/test/CodeGen/DirectX/BufferLoad.ll
index 7f1291bf4a5c..86e2217a8e76 100644
--- a/llvm/test/CodeGen/DirectX/BufferLoad.ll
+++ b/llvm/test/CodeGen/DirectX/BufferLoad.ll
@@ -17,8 +17,9 @@ define void @loadv4f32() {
   ; CHECK-NOT: %dx.resource.casthandle
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call <4 x float> @llvm.dx.resource.load.typedbuffer(
+  %load0 = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 0)
+  %data0 = extractvalue {<4 x float>, i1} %load0, 0
 
   ; The extract order depends on the users, so don't enforce that here.
   ; CHECK-DAG: [[VAL0_0:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA0]], 0
@@ -34,8 +35,9 @@ define void @loadv4f32() {
   call void @scalar_user(float %data0_2)
 
   ; CHECK: [[DATA4:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 4, i32 undef)
-  %data4 = call <4 x float> @llvm.dx.resource.load.typedbuffer(
+  %load4 = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 4)
+  %data4 = extractvalue {<4 x float>, i1} %load4, 0
 
   ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA4]], 0
   ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA4]], 1
@@ -48,8 +50,9 @@ define void @loadv4f32() {
   call void @vector_user(<4 x float> %data4)
 
   ; CHECK: [[DATA12:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 12, i32 undef)
-  %data12 = call <4 x float> @llvm.dx.resource.load.typedbuffer(
+  %load12 = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 12)
+  %data12 = extractvalue {<4 x float>, i1} %load12, 0
 
   ; CHECK: [[DATA12_3:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA12]], 3
   %data12_3 = extractelement <4 x float> %data12, i32 3
@@ -70,8 +73,9 @@ define void @index_dynamic(i32 %bufindex, i32 %elemindex) {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[LOAD:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 %bufindex, i32 undef)
-  %load = call <4 x float> @llvm.dx.resource.load.typedbuffer(
+  %load = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 %bufindex)
+  %data = extractvalue {<4 x float>, i1} %load, 0
 
   ; CHECK: [[ALLOCA:%.*]] = alloca [4 x float]
   ; CHECK: [[V0:%.*]] = extractvalue %dx.types.ResRet.f32 [[LOAD]], 0
@@ -89,10 +93,10 @@ define void @index_dynamic(i32 %bufindex, i32 %elemindex) {
   ;
   ; CHECK: [[PTR:%.*]] = getelementptr inbounds [4 x float], ptr [[ALLOCA]], i32 0, i32 %elemindex
   ; CHECK: [[X:%.*]] = load float, ptr [[PTR]]
-  %data = extractelement <4 x float> %load, i32 %elemindex
+  %x = extractelement <4 x float> %data, i32 %elemindex
 
   ; CHECK: call void @scalar_user(float [[X]])
-  call void @scalar_user(float %data)
+  call void @scalar_user(float %x)
 
   ret void
 }
@@ -105,8 +109,9 @@ define void @loadf32() {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call float @llvm.dx.resource.load.typedbuffer(
+  %load0 = call {float, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", float, 0, 0, 0) %buffer, i32 0)
+  %data0 = extractvalue {float, i1} %load0, 0
 
   ; CHECK: [[VAL0:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA0]], 0
   ; CHECK: call void @scalar_user(float [[VAL0]])
@@ -123,7 +128,7 @@ define void @loadv2f32() {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call <2 x float> @llvm.dx.resource.load.typedbuffer(
+  %data0 = call {<2 x float>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <2 x float>, 0, 0, 0) %buffer, i32 0)
 
   ret void
@@ -137,7 +142,7 @@ define void @loadv4f32_checkbit() {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call {<4 x float>, i1} @llvm.dx.resource.loadchecked.typedbuffer.f32(
+  %data0 = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer.f32(
       target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 0)
 
   ; CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA0]], 4
@@ -158,7 +163,7 @@ define void @loadv4i32() {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.i32 @dx.op.bufferLoad.i32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call <4 x i32> @llvm.dx.resource.load.typedbuffer(
+  %data0 = call {<4 x i32>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x i32>, 0, 0, 0) %buffer, i32 0)
 
   ret void
@@ -172,7 +177,7 @@ define void @loadv4f16() {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f16 @dx.op.bufferLoad.f16(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call <4 x half> @llvm.dx.resource.load.typedbuffer(
+  %data0 = call {<4 x half>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x half>, 0, 0, 0) %buffer, i32 0)
 
   ret void
@@ -186,7 +191,7 @@ define void @loadv4i16() {
           i32 0, i32 0, i32 1, i32 0, i1 false)
 
   ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.i16 @dx.op.bufferLoad.i16(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef)
-  %data0 = call <4 x i16> @llvm.dx.resource.load.typedbuffer(
+  %data0 = call {<4 x i16>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x i16>, 0, 0, 0) %buffer, i32 0)
 
   ret void
diff --git a/llvm/test/CodeGen/DirectX/HLSLControlFlowHint.ll b/llvm/test/CodeGen/DirectX/HLSLControlFlowHint.ll
deleted file mode 100644
index fe66e481359b..000000000000
--- a/llvm/test/CodeGen/DirectX/HLSLControlFlowHint.ll
+++ /dev/null
@@ -1,98 +0,0 @@
-; RUN: opt -S -dxil-op-lower -dxil-translate-metadata -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
-
-; This test make sure LLVM metadata is being translated into DXIL.
-
-
-; CHECK: define i32 @test_branch(i32 %X)
-; CHECK-NO: hlsl.controlflow.hint
-; CHECK: br i1 %cmp, label %if.then, label %if.else, !dx.controlflow.hints [[HINT_BRANCH:![0-9]+]]
-define i32 @test_branch(i32 %X) {
-entry:
-  %X.addr = alloca i32, align 4
-  %resp = alloca i32, align 4
-  store i32 %X, ptr %X.addr, align 4
-  %0 = load i32, ptr %X.addr, align 4
-  %cmp = icmp sgt i32 %0, 0
-  br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !0
-
-if.then:                                          ; preds = %entry
-  %1 = load i32, ptr %X.addr, align 4
-  %sub = sub nsw i32 0, %1
-  store i32 %sub, ptr %resp, align 4
-  br label %if.end
-
-if.else:                                          ; preds = %entry
-  %2 = load i32, ptr %X.addr, align 4
-  %mul = mul nsw i32 %2, 2
-  store i32 %mul, ptr %resp, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  %3 = load i32, ptr %resp, align 4
-  ret i32 %3
-}
-
-
-; CHECK: define i32 @test_flatten(i32 %X)
-; CHECK-NO: hlsl.controlflow.hint
-; CHECK: br i1 %cmp, label %if.then, label %if.else, !dx.controlflow.hints [[HINT_FLATTEN:![0-9]+]]
-define i32 @test_flatten(i32 %X) {
-entry:
-  %X.addr = alloca i32, align 4
-  %resp = alloca i32, align 4
-  store i32 %X, ptr %X.addr, align 4
-  %0 = load i32, ptr %X.addr, align 4
-  %cmp = icmp sgt i32 %0, 0
-  br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !1
-
-if.then:                                          ; preds = %entry
-  %1 = load i32, ptr %X.addr, align 4
-  %sub = sub nsw i32 0, %1
-  store i32 %sub, ptr %resp, align 4
-  br label %if.end
-
-if.else:                                          ; preds = %entry
-  %2 = load i32, ptr %X.addr, align 4
-  %mul = mul nsw i32 %2, 2
-  store i32 %mul, ptr %resp, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  %3 = load i32, ptr %resp, align 4
-  ret i32 %3
-}
-
-
-; CHECK: define i32 @test_no_attr(i32 %X)
-; CHECK-NO: hlsl.controlflow.hint
-; CHECK-NO: !dx.controlflow.hints
-define i32 @test_no_attr(i32 %X) {
-entry:
-  %X.addr = alloca i32, align 4
-  %resp = alloca i32, align 4
-  store i32 %X, ptr %X.addr, align 4
-  %0 = load i32, ptr %X.addr, align 4
-  %cmp = icmp sgt i32 %0, 0
-  br i1 %cmp, label %if.then, label %if.else
-
-if.then:                                          ; preds = %entry
-  %1 = load i32, ptr %X.addr, align 4
-  %sub = sub nsw i32 0, %1
-  store i32 %sub, ptr %resp, align 4
-  br label %if.end
-
-if.else:                                          ; preds = %entry
-  %2 = load i32, ptr %X.addr, align 4
-  %mul = mul nsw i32 %2, 2
-  store i32 %mul, ptr %resp, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  %3 = load i32, ptr %resp, align 4
-  ret i32 %3
-}
-; CHECK-NO: hlsl.controlflow.hint
-; CHECK: [[HINT_BRANCH]] = !{!"dx.controlflow.hints", i32 1}
-; CHECK: [[HINT_FLATTEN]] = !{!"dx.controlflow.hints", i32 2}
-!0 = !{!"hlsl.controlflow.hint", i32 1}
-!1 = !{!"hlsl.controlflow.hint", i32 2}
diff --git a/llvm/test/CodeGen/DirectX/RawBufferLoad-error64.ll b/llvm/test/CodeGen/DirectX/RawBufferLoad-error64.ll
new file mode 100644
index 000000000000..b8a6649baf68
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/RawBufferLoad-error64.ll
@@ -0,0 +1,24 @@
+; We use llc for this test so that we don't abort after the first error.
+; RUN: not llc %s -o /dev/null 2>&1 | FileCheck %s
+
+target triple = "dxil-pc-shadermodel6.2-compute"
+
+declare void @v4f64_user(<4 x double>)
+
+; Can't load 64 bit types directly until SM6.3 (byteaddressbuf.Load<int64_t4>)
+; CHECK: error:
+; CHECK-SAME: in function loadv4f64_byte
+; CHECK-SAME: Cannot create RawBufferLoad operation: Invalid overload type
+define void @loadv4f64_byte(i32 %offset) "hlsl.export" {
+  %buffer = call target("dx.RawBuffer", i8, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  %load = call {<4 x double>, i1} @llvm.dx.resource.load.rawbuffer.v4i64(
+      target("dx.RawBuffer", i8, 0, 0, 0) %buffer, i32 %offset, i32 0)
+  %data = extractvalue {<4 x double>, i1} %load, 0
+
+  call void @v4f64_user(<4 x double> %data)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/RawBufferLoad.ll b/llvm/test/CodeGen/DirectX/RawBufferLoad.ll
new file mode 100644
index 000000000000..586b9c44e95d
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/RawBufferLoad.ll
@@ -0,0 +1,232 @@
+; RUN: opt -S -dxil-op-lower %s | FileCheck %s
+
+target triple = "dxil-pc-shadermodel6.6-compute"
+
+declare void @f32_user(float)
+declare void @v4f32_user(<4 x float>)
+declare void @i32_user(i32)
+declare void @v4i32_user(<4 x i32>)
+declare void @v3f16_user(<3 x half>)
+declare void @v4f64_user(<4 x double>)
+
+; CHECK-LABEL: define void @loadf32_struct
+define void @loadf32_struct(i32 %index) {
+  %buffer = call target("dx.RawBuffer", float, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 0, i8 1, i32 4)
+  %load = call {float, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_f32_0_0_0t(
+          target("dx.RawBuffer", float, 0, 0, 0) %buffer,
+          i32 %index,
+          i32 0)
+  %data = extractvalue {float, i1} %load, 0
+
+  ; CHECK: [[VAL:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA]], 0
+  ; CHECK: call void @f32_user(float [[VAL]])
+  call void @f32_user(float %data)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadf32_byte
+define void @loadf32_byte(i32 %offset) {
+  %buffer = call target("dx.RawBuffer", i8, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 %offset, i32 0, i8 1, i32 4)
+  %load = call {float, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_i8_0_0_0t(
+          target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+          i32 %offset,
+          i32 0)
+  %data = extractvalue {float, i1} %load, 0
+
+  ; CHECK: [[VAL:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA]], 0
+  ; CHECK: call void @f32_user(float [[VAL]])
+  call void @f32_user(float %data)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadv4f32_struct
+define void @loadv4f32_struct(i32 %index) {
+  %buffer = call target("dx.RawBuffer", <4 x float>, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_v4f32_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 0, i8 15, i32 4)
+  %load = call {<4 x float>, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_v4f32_0_0_0t(
+          target("dx.RawBuffer", <4 x float>, 0, 0, 0) %buffer,
+          i32 %index,
+          i32 0)
+  %data = extractvalue {<4 x float>, i1} %load, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 2
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 3
+  ; CHECK: insertelement <4 x float> undef
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: call void @v4f32_user(<4 x float>
+  call void @v4f32_user(<4 x float> %data)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadv4f32_byte
+define void @loadv4f32_byte(i32 %offset) {
+  %buffer = call target("dx.RawBuffer", i8, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 %offset, i32 0, i8 15, i32 4)
+  %load = call {<4 x float>, i1}
+      @llvm.dx.resource.load.rawbuffer.f32.tdx.RawBuffer_i8_0_0_0t(
+          target("dx.RawBuffer", i8, 0, 0, 0) %buffer,
+          i32 %offset,
+          i32 0)
+  %data = extractvalue {<4 x float>, i1} %load, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 2
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATA]], 3
+  ; CHECK: insertelement <4 x float> undef
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: call void @v4f32_user(<4 x float>
+  call void @v4f32_user(<4 x float> %data)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadelements
+define void @loadelements(i32 %index) {
+  %buffer = call target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATAF32:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 0, i8 15, i32 4)
+  %loadf32 = call {<4 x float>, i1}
+      @llvm.dx.resource.load.rawbuffer.v4f32(
+          target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0, 0) %buffer,
+          i32 %index,
+          i32 0)
+  %dataf32 = extractvalue {<4 x float>, i1} %loadf32, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 2
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 3
+  ; CHECK: insertelement <4 x float> undef
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: call void @v4f32_user(<4 x float>
+  call void @v4f32_user(<4 x float> %dataf32)
+
+  ; CHECK: [[DATAI32:%.*]] = call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 1, i8 15, i32 4)
+  %loadi32 = call {<4 x i32>, i1}
+      @llvm.dx.resource.load.rawbuffer.v4i32(
+          target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0, 0) %buffer,
+          i32 %index,
+          i32 1)
+  %datai32 = extractvalue {<4 x i32>, i1} %loadi32, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.i32 [[DATAI32]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.i32 [[DATAI32]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.i32 [[DATAI32]], 2
+  ; CHECK: extractvalue %dx.types.ResRet.i32 [[DATAI32]], 3
+  ; CHECK: insertelement <4 x i32> undef
+  ; CHECK: insertelement <4 x i32>
+  ; CHECK: insertelement <4 x i32>
+  ; CHECK: insertelement <4 x i32>
+  ; CHECK: call void @v4i32_user(<4 x i32>
+  call void @v4i32_user(<4 x i32> %datai32)
+
+  ret void
+}
+
+; CHECK-LABEL: define void @loadnested
+define void @loadnested(i32 %index) {
+  %buffer = call
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATAI32:%.*]] = call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 0, i8 1, i32 4)
+  %loadi32 = call {i32, i1} @llvm.dx.resource.load.rawbuffer.i32(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 0)
+  %datai32 = extractvalue {i32, i1} %loadi32, 0
+
+  ; CHECK: [[VALI32:%.*]] = extractvalue %dx.types.ResRet.i32 [[DATAI32]], 0
+  ; CHECK: call void @i32_user(i32 [[VALI32]])
+  call void @i32_user(i32 %datai32)
+
+  ; CHECK: [[DATAF32:%.*]] = call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 4, i8 15, i32 4)
+  %loadf32 = call {<4 x float>, i1} @llvm.dx.resource.load.rawbuffer.v4f32(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 4)
+  %dataf32 = extractvalue {<4 x float>, i1} %loadf32, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 2
+  ; CHECK: extractvalue %dx.types.ResRet.f32 [[DATAF32]], 3
+  ; CHECK: insertelement <4 x float> undef
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: insertelement <4 x float>
+  ; CHECK: call void @v4f32_user(<4 x float>
+  call void @v4f32_user(<4 x float> %dataf32)
+
+  ; CHECK: [[DATAF16:%.*]] = call %dx.types.ResRet.f16 @dx.op.rawBufferLoad.f16(i32 139, %dx.types.Handle %{{.*}}, i32 %index, i32 20, i8 7, i32 2)
+  %loadf16 = call {<3 x half>, i1} @llvm.dx.resource.load.rawbuffer.v3f16(
+      target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 0, 0, 0) %buffer,
+      i32 %index, i32 20)
+  %dataf16 = extractvalue {<3 x half>, i1} %loadf16, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.f16 [[DATAF16]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.f16 [[DATAF16]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.f16 [[DATAF16]], 2
+  ; CHECK: insertelement <3 x half> undef
+  ; CHECK: insertelement <3 x half>
+  ; CHECK: insertelement <3 x half>
+  ; CHECK: call void @v3f16_user(<3 x half>
+  call void @v3f16_user(<3 x half> %dataf16)
+
+  ret void
+}
+
+; byteaddressbuf.Load<int64_t4>
+; CHECK-LABEL: define void @loadv4f64_byte
+define void @loadv4f64_byte(i32 %offset) {
+  %buffer = call target("dx.RawBuffer", i8, 0, 0, 0)
+      @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0_0(
+          i32 0, i32 0, i32 1, i32 0, i1 false)
+
+  ; CHECK: [[DATA:%.*]] = call %dx.types.ResRet.f64 @dx.op.rawBufferLoad.f64(i32 139, %dx.types.Handle %{{.*}}, i32 %offset, i32 0, i8 15, i32 8)
+  %load = call {<4 x double>, i1} @llvm.dx.resource.load.rawbuffer.v4i64(
+      target("dx.RawBuffer", i8, 0, 0, 0) %buffer, i32 %offset, i32 0)
+  %data = extractvalue {<4 x double>, i1} %load, 0
+
+  ; CHECK: extractvalue %dx.types.ResRet.f64 [[DATA]], 0
+  ; CHECK: extractvalue %dx.types.ResRet.f64 [[DATA]], 1
+  ; CHECK: extractvalue %dx.types.ResRet.f64 [[DATA]], 2
+  ; CHECK: extractvalue %dx.types.ResRet.f64 [[DATA]], 3
+  ; CHECK: insertelement <4 x double> undef
+  ; CHECK: insertelement <4 x double>
+  ; CHECK: insertelement <4 x double>
+  ; CHECK: insertelement <4 x double>
+  ; CHECK: call void @v4f64_user(<4 x double>
+  call void @v4f64_user(<4 x double> %data)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load_typedbuffer.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load_typedbuffer.ll
index 9b7e7fd04f60..8769e6ec66d8 100644
--- a/llvm/test/CodeGen/DirectX/ResourceAccess/load_typedbuffer.ll
+++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load_typedbuffer.ll
@@ -15,17 +15,19 @@ define void @load_float4(i32 %index, i32 %elemindex) {
   %ptr = call ptr @llvm.dx.resource.getpointer(
       target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
 
-  ; CHECK: %[[VALUE:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VALUE:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
   %vec_data = load <4 x float>, ptr %ptr
   call void @use_float4(<4 x float> %vec_data)
 
-  ; CHECK: %[[VALUE:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VALUE:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0
   ; CHECK: extractelement <4 x float> %[[VALUE]], i32 1
   %y_ptr = getelementptr inbounds <4 x float>, ptr %ptr, i32 0, i32 1
   %y_data = load float, ptr %y_ptr
   call void @use_float(float %y_data)
 
-  ; CHECK: %[[VALUE:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VALUE:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0
   ; CHECK: extractelement <4 x float> %[[VALUE]], i32 %elemindex
   %dynamic = getelementptr inbounds <4 x float>, ptr %ptr, i32 0, i32 %elemindex
   %dyndata = load float, ptr %dynamic
diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/store_typedbuffer.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/store_typedbuffer.ll
index 17606408cadf..0b7882ac722e 100644
--- a/llvm/test/CodeGen/DirectX/ResourceAccess/store_typedbuffer.ll
+++ b/llvm/test/CodeGen/DirectX/ResourceAccess/store_typedbuffer.ll
@@ -18,21 +18,24 @@ define void @store_float4(<4 x float> %data, i32 %index, i32 %elemindex) {
 
   ; Store just the .x component
   %scalar = extractelement <4 x float> %data, i32 0
-  ; CHECK: %[[LOAD:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[LOAD]], float %scalar, i32 0
+  ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[VEC]], float %scalar, i32 0
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index, <4 x float> %[[INSERT]])
   store float %scalar, ptr %ptr
 
   ; Store just the .y component
-  ; CHECK: %[[LOAD:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[LOAD]], float %scalar, i32 1
+  ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[VEC]], float %scalar, i32 1
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index, <4 x float> %[[INSERT]])
   %y_ptr = getelementptr inbounds i8, ptr %ptr, i32 4
   store float %scalar, ptr %y_ptr
 
   ; Store to one of the elements dynamically
-  ; CHECK: %[[LOAD:.*]] = call <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[LOAD]], float %scalar, i32 %elemindex
+  ; CHECK: %[[LOAD:.*]] = call { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <4 x float>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <4 x float> %[[VEC]], float %scalar, i32 %elemindex
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %buffer, i32 %index, <4 x float> %[[INSERT]])
   %dynamic = getelementptr inbounds <4 x float>, ptr %ptr, i32 0, i32 %elemindex
   store float %scalar, ptr %dynamic
@@ -56,14 +59,16 @@ define void @store_half4(<4 x half> %data, i32 %index) {
 
   ; Store just the .x component
   %scalar = extractelement <4 x half> %data, i32 0
-  ; CHECK: %[[LOAD:.*]] = call <4 x half> @llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_1_0_0t(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <4 x half> %[[LOAD]], half %scalar, i32 0
+  ; CHECK: %[[LOAD:.*]] = call { <4 x half>, i1 } @llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_1_0_0t(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <4 x half>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <4 x half> %[[VEC]], half %scalar, i32 0
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f16_1_0_0t.v4f16(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index, <4 x half> %[[INSERT]])
   store half %scalar, ptr %ptr
 
   ; Store just the .y component
-  ; CHECK: %[[LOAD:.*]] = call <4 x half> @llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_1_0_0t(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <4 x half> %[[LOAD]], half %scalar, i32 1
+  ; CHECK: %[[LOAD:.*]] = call { <4 x half>, i1 } @llvm.dx.resource.load.typedbuffer.v4f16.tdx.TypedBuffer_v4f16_1_0_0t(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <4 x half>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <4 x half> %[[VEC]], half %scalar, i32 1
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f16_1_0_0t.v4f16(target("dx.TypedBuffer", <4 x half>, 1, 0, 0) %buffer, i32 %index, <4 x half> %[[INSERT]])
   %y_ptr = getelementptr inbounds i8, ptr %ptr, i32 2
   store half %scalar, ptr %y_ptr
@@ -87,14 +92,16 @@ define void @store_double2(<2 x double> %data, i32 %index) {
 
   ; Store just the .x component
   %scalar = extractelement <2 x double> %data, i32 0
-  ; CHECK: %[[LOAD:.*]] = call <2 x double> @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <2 x double> %[[LOAD]], double %scalar, i32 0
+  ; CHECK: %[[LOAD:.*]] = call { <2 x double>, i1 } @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <2 x double>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <2 x double> %[[VEC]], double %scalar, i32 0
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2f64_1_0_0t.v2f64(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index, <2 x double> %[[INSERT]])
   store double %scalar, ptr %ptr
 
   ; Store just the .y component
-  ; CHECK: %[[LOAD:.*]] = call <2 x double> @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index)
-  ; CHECK: %[[INSERT:.*]] = insertelement <2 x double> %[[LOAD]], double %scalar, i32 1
+  ; CHECK: %[[LOAD:.*]] = call { <2 x double>, i1 } @llvm.dx.resource.load.typedbuffer.v2f64.tdx.TypedBuffer_v2f64_1_0_0t(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index)
+  ; CHECK: %[[VEC:.*]] = extractvalue { <2 x double>, i1 } %[[LOAD]], 0
+  ; CHECK: %[[INSERT:.*]] = insertelement <2 x double> %[[VEC]], double %scalar, i32 1
   ; CHECK: call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v2f64_1_0_0t.v2f64(target("dx.TypedBuffer", <2 x double>, 1, 0, 0) %buffer, i32 %index, <2 x double> %[[INSERT]])
   %y_ptr = getelementptr inbounds i8, ptr %ptr, i32 8
   store double %scalar, ptr %y_ptr
diff --git a/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll b/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll
index c837b36a19e1..cd21adc11a9b 100644
--- a/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll
+++ b/llvm/test/CodeGen/DirectX/ResourceGlobalElimination.ll
@@ -29,18 +29,20 @@ entry:
   %0 = call i32 @llvm.dx.flattened.thread.id.in.group()
   ; CHECK-NOT: load {{.*}} ptr @In
   %1 = load target("dx.TypedBuffer", <4 x float>, 1, 0, 0), ptr @In, align 4
-  ; CSE: call noundef <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t
-  %2 = call noundef <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %1, i32 %0)
+  ; CSE: call noundef { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t
+  %load = call noundef {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %1, i32 %0)
+  %2 = extractvalue {<4 x float>, i1} %load, 0
   ; CHECK-NOT: load {{.*}} ptr @In
   %3 = load target("dx.TypedBuffer", <4 x float>, 1, 0, 0), ptr @In, align 4
-  %4 = call noundef <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %3, i32 %0)
+  %load2 = call noundef {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %3, i32 %0)
+  %4 = extractvalue {<4 x float>, i1} %load2, 0
   %add.i = fadd <4 x float> %2, %4
   call void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %Out_h.i, i32 %0, <4 x float> %add.i)
   ; CHECK: ret void
   ret void
 }
 
-; CSE-DAG: declare <4 x float> @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32) [[ROAttr:#[0-9]+]]
+; CSE-DAG: declare { <4 x float>, i1 } @llvm.dx.resource.load.typedbuffer.v4f32.tdx.TypedBuffer_v4f32_1_0_0t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32) [[ROAttr:#[0-9]+]]
 ; CSE-DAG: declare void @llvm.dx.resource.store.typedbuffer.tdx.TypedBuffer_v4f32_1_0_0t.v4f32(target("dx.TypedBuffer", <4 x float>, 1, 0, 0), i32, <4 x float>) [[WOAttr:#[0-9]+]]
 
 attributes #0 = { convergent noinline norecurse "frame-pointer"="all" "hlsl.numthreads"="8,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/typed-uav-load-additional-formats.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/typed-uav-load-additional-formats.ll
index 26223359dfdf..060d54f961c7 100644
--- a/llvm/test/CodeGen/DirectX/ShaderFlags/typed-uav-load-additional-formats.ll
+++ b/llvm/test/CodeGen/DirectX/ShaderFlags/typed-uav-load-additional-formats.ll
@@ -17,8 +17,9 @@ target triple = "dxil-pc-shadermodel6.7-library"
 define <4 x float> @multicomponent() #0 {
   %res = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
       @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
-  %val = call <4 x float> @llvm.dx.resource.load.typedbuffer(
+  %load = call {<4 x float>, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %res, i32 0)
+  %val = extractvalue {<4 x float>, i1} %load, 0
   ret <4 x float> %val
 }
 
@@ -26,8 +27,9 @@ define <4 x float> @multicomponent() #0 {
 define float @onecomponent() #0 {
   %res = call target("dx.TypedBuffer", float, 1, 0, 0)
       @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
-  %val = call float @llvm.dx.resource.load.typedbuffer(
+  %load = call {float, i1} @llvm.dx.resource.load.typedbuffer(
       target("dx.TypedBuffer", float, 1, 0, 0) %res, i32 0)
+  %val = extractvalue {float, i1} %load, 0
   ret float %val
 }
 
diff --git a/llvm/test/CodeGen/Hexagon/loopIdiom.ll b/llvm/test/CodeGen/Hexagon/loopIdiom.ll
new file mode 100644
index 000000000000..9c3df674a493
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/loopIdiom.ll
@@ -0,0 +1,75 @@
+; RUN: opt -debug -S -march=hexagon -O2  < %s | FileCheck %s
+; REQUIRES: asserts
+; CHECK: define dso_local void @complexMultAccum
+target triple = "hexagon"
+
+; Function Attrs: noinline nounwind
+define dso_local void @complexMultAccum(i32 noundef %n) #0 {
+entry:
+  %n.addr = alloca i32, align 4
+  %run_c_code = alloca i8, align 1
+  %run_asm_code = alloca i8, align 1
+  %iOutter = alloca i32, align 4
+  %iOutter1 = alloca i32, align 4
+  store i32 %n, ptr %n.addr, align 4
+  store i8 1, ptr %run_c_code, align 1
+  store i8 0, ptr %run_asm_code, align 1
+  %0 = load i8, ptr %run_c_code, align 1
+  %tobool = icmp ne i8 %0, 0
+  br i1 %tobool, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  store i32 0, ptr %iOutter, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %if.then
+  %1 = load i32, ptr %iOutter, align 4
+  %cmp = icmp slt i32 %1, 2
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %2 = load i32, ptr %iOutter, align 4
+  %inc = add nsw i32 %2, 1
+  store i32 %inc, ptr %iOutter, align 4
+  br label %for.cond, !llvm.loop !3
+
+for.end:                                          ; preds = %for.cond
+  store i32 0, ptr %iOutter1, align 4
+  br label %for.cond2
+
+for.cond2:                                        ; preds = %for.inc5, %for.end
+  %3 = load i32, ptr %iOutter1, align 4
+  %cmp3 = icmp slt i32 %3, 2
+  br i1 %cmp3, label %for.body4, label %for.end7
+
+for.body4:                                        ; preds = %for.cond2
+  br label %for.inc5
+
+for.inc5:                                         ; preds = %for.body4
+  %4 = load i32, ptr %iOutter1, align 4
+  %inc6 = add nsw i32 %4, 1
+  store i32 %inc6, ptr %iOutter1, align 4
+  br label %for.cond2, !llvm.loop !5
+
+for.end7:                                         ; preds = %for.cond2
+  br label %if.end
+
+if.end:                                           ; preds = %for.end7, %entry
+  ret void
+}
+
+attributes #0 = { noinline nounwind "approx-func-fp-math"="true" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv79" "target-features"="+v79,-long-calls" "unsafe-fp-math"="true" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{!"LLVM Clang"}
+!3 = distinct !{!3, !4}
+!4 = !{!"llvm.loop.mustprogress"}
+!5 = distinct !{!5, !4}
+
diff --git a/llvm/test/CodeGen/NVPTX/b52037.ll b/llvm/test/CodeGen/NVPTX/b52037.ll
index 5d1c390909f6..b6317dfb2859 100644
--- a/llvm/test/CodeGen/NVPTX/b52037.ll
+++ b/llvm/test/CodeGen/NVPTX/b52037.ll
@@ -39,7 +39,7 @@ declare %int3 @hoge(i32, i32, i32) local_unnamed_addr
 
 declare i64 @foo() local_unnamed_addr
 
-define void @barney(ptr nocapture readonly %arg) local_unnamed_addr {
+define ptx_kernel void @barney(ptr nocapture readonly %arg) local_unnamed_addr {
 bb:
   tail call void asm sideeffect "// KEEP", ""() #1
   %tmp = alloca %struct.zot, align 16
@@ -210,9 +210,6 @@ bb14:                                             ; preds = %bb49.i.lr.ph, %bb49
 attributes #0 = { argmemonly mustprogress nofree nounwind willreturn }
 attributes #1 = { nounwind }
 
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @barney, !"kernel", i32 1}
 !1 = !{!2, !11, i64 64}
 !2 = !{!"_ZTSN7cuneibs22neiblist_iterator_coreE", !3, i64 0, !3, i64 8, !6, i64 16, !8, i64 32, !9, i64 44, !10, i64 48, !11, i64 64, !9, i64 72, !4, i64 76, !9, i64 80}
 !3 = !{!"any pointer", !4, i64 0}
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index 03cdeb9683ab..8be3a66b7f48 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -182,8 +182,8 @@ define <2 x bfloat> @test_fneg(<2 x bfloat> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_fneg_param_0];
-; CHECK-NEXT:    xor.b32 %r2, %r1, -2147450880;
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fneg_param_0];
+; CHECK-NEXT:    neg.bf16x2 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %r = fneg <2 x bfloat> %a
@@ -532,8 +532,8 @@ define <2 x bfloat> @test_fabs(<2 x bfloat> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_fabs_param_0];
-; CHECK-NEXT:    and.b32 %r2, %r1, 2147450879;
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fabs_param_0];
+; CHECK-NEXT:    abs.bf16x2 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %a)
diff --git a/llvm/test/CodeGen/NVPTX/bug21465.ll b/llvm/test/CodeGen/NVPTX/bug21465.ll
index 9b1f1049c648..76300e3cfdc5 100644
--- a/llvm/test/CodeGen/NVPTX/bug21465.ll
+++ b/llvm/test/CodeGen/NVPTX/bug21465.ll
@@ -8,7 +8,7 @@ target triple = "nvptx64-unknown-unknown"
 %struct.S = type { i32, i32 }
 
 ; Function Attrs: nounwind
-define void @_Z11TakesStruct1SPi(ptr byval(%struct.S) nocapture readonly %input, ptr nocapture %output) #0 {
+define ptx_kernel void @_Z11TakesStruct1SPi(ptr byval(%struct.S) nocapture readonly %input, ptr nocapture %output) #0 {
 entry:
 ; CHECK-LABEL: @_Z11TakesStruct1SPi
 ; PTX-LABEL: .visible .entry _Z11TakesStruct1SPi(
@@ -23,7 +23,3 @@ entry:
 }
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @_Z11TakesStruct1SPi, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/bug22322.ll b/llvm/test/CodeGen/NVPTX/bug22322.ll
index e3656fd16b21..ace31667184b 100644
--- a/llvm/test/CodeGen/NVPTX/bug22322.ll
+++ b/llvm/test/CodeGen/NVPTX/bug22322.ll
@@ -8,7 +8,7 @@ target triple = "nvptx64-nvidia-cuda"
 
 ; Function Attrs: nounwind
 ; CHECK-LABEL: some_kernel
-define void @some_kernel(ptr nocapture %dst) #0 {
+define ptx_kernel void @some_kernel(ptr nocapture %dst) #0 {
 _ZL11compute_vecRK6float3jb.exit:
   %ret_vec.sroa.8.i = alloca float, align 4
   %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
@@ -55,8 +55,5 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "n
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind }
 
-!nvvm.annotations = !{!0}
 !llvm.ident = !{!1}
-
-!0 = !{ptr @some_kernel, !"kernel", i32 1}
 !1 = !{!"clang version 3.5.1 (tags/RELEASE_351/final)"}
diff --git a/llvm/test/CodeGen/NVPTX/bug26185.ll b/llvm/test/CodeGen/NVPTX/bug26185.ll
index 00c97fb381e0..193df7f86ca7 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185.ll
@@ -8,7 +8,7 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-unknown-unknown"
 
 ; CHECK-LABEL: ex_zext
-define void @ex_zext(ptr noalias readonly %data, ptr %res) {
+define ptx_kernel void @ex_zext(ptr noalias readonly %data, ptr %res) {
 entry:
 ; CHECK: ld.global.nc.u8
   %val = load i8, ptr %data
@@ -19,7 +19,7 @@ entry:
 }
 
 ; CHECK-LABEL: ex_sext
-define void @ex_sext(ptr noalias readonly %data, ptr %res) {
+define ptx_kernel void @ex_sext(ptr noalias readonly %data, ptr %res) {
 entry:
 ; CHECK: ld.global.nc.u8
   %val = load i8, ptr %data
@@ -30,7 +30,7 @@ entry:
 }
 
 ; CHECK-LABEL: ex_zext_v2
-define void @ex_zext_v2(ptr noalias readonly %data, ptr %res) {
+define ptx_kernel void @ex_zext_v2(ptr noalias readonly %data, ptr %res) {
 entry:
 ; CHECK: ld.global.nc.v2.u8
   %val = load <2 x i8>, ptr %data
@@ -41,7 +41,7 @@ entry:
 }
 
 ; CHECK-LABEL: ex_sext_v2
-define void @ex_sext_v2(ptr noalias readonly %data, ptr %res) {
+define ptx_kernel void @ex_sext_v2(ptr noalias readonly %data, ptr %res) {
 entry:
 ; CHECK: ld.global.nc.v2.u8
   %val = load <2 x i8>, ptr %data
@@ -51,8 +51,3 @@ entry:
   ret void
 }
 
-!nvvm.annotations = !{!0,!1,!2,!3}
-!0 = !{ptr @ex_zext, !"kernel", i32 1}
-!1 = !{ptr @ex_sext, !"kernel", i32 1}
-!2 = !{ptr @ex_zext_v2, !"kernel", i32 1}
-!3 = !{ptr @ex_sext_v2, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index 19f4ef8ec77b..1c9d271902fd 100644
--- a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -16,7 +16,7 @@
 ;  }
 
 ; CHECK: .visible .entry kernel_func
-define void @kernel_func(ptr %a) {
+define ptx_kernel void @kernel_func(ptr %a) {
 entry:
   %buf = alloca [16 x i8], align 4
 
@@ -56,7 +56,3 @@ entry:
 }
 
 declare void @callee(ptr, ptr)
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @kernel_func, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/cluster-dim.ll b/llvm/test/CodeGen/NVPTX/cluster-dim.ll
index c9258addbe04..9275c895b224 100644
--- a/llvm/test/CodeGen/NVPTX/cluster-dim.ll
+++ b/llvm/test/CodeGen/NVPTX/cluster-dim.ll
@@ -3,7 +3,7 @@
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 | FileCheck -check-prefixes=CHECK90 %s
 ; RUN: %if ptxas-12.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %}
 
-define void @kernel_func_clusterxyz() {
+define ptx_kernel void @kernel_func_clusterxyz() {
 ; CHECK80-LABEL: kernel_func_clusterxyz(
 ; CHECK80:       {
 ; CHECK80-EMPTY:
@@ -23,7 +23,6 @@ define void @kernel_func_clusterxyz() {
 }
 
 
-!nvvm.annotations = !{!1, !2}
+!nvvm.annotations = !{!1}
 
-!1 = !{ptr @kernel_func_clusterxyz, !"kernel", i32 1}
-!2 = !{ptr @kernel_func_clusterxyz, !"cluster_dim_x", i32 3, !"cluster_dim_y", i32 5, !"cluster_dim_z", i32 7}
+!1 = !{ptr @kernel_func_clusterxyz, !"cluster_dim_x", i32 3, !"cluster_dim_y", i32 5, !"cluster_dim_z", i32 7}
diff --git a/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll b/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll
new file mode 100644
index 000000000000..1b1bb91d5c79
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
+target triple = "nvptx64-nvidia-cuda"
+
+@a = external global ptr align 16
+
+define i32  @test_disjoint_or_addr(i16 %a) {
+; CHECK-LABEL: test_disjoint_or_addr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.u64 %rd1, a;
+; CHECK-NEXT:    cvta.global.u64 %rd2, %rd1;
+; CHECK-NEXT:    ld.u32 %r1, [%rd2+8];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %a1 = ptrtoint ptr @a to i64
+  %a2 = or disjoint i64 %a1, 8
+  %a3 = inttoptr i64 %a2 to ptr
+  %v = load i32, ptr %a3
+  ret i32 %v
+}
diff --git a/llvm/test/CodeGen/NVPTX/fabs-fneg-free.ll b/llvm/test/CodeGen/NVPTX/fabs-fneg-free.ll
new file mode 100644
index 000000000000..9031f33939f2
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fabs-fneg-free.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
+target triple = "nvptx64-nvidia-cuda"
+
+define float @fabs_free(i32 %in) {
+; CHECK-LABEL: fabs_free(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [fabs_free_param_0];
+; CHECK-NEXT:    abs.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    ret;
+  %b = bitcast i32 %in to float
+  %f = call float @llvm.fabs.f32(float %b)
+  ret float %f
+}
+
+define float @fneg_free(i32 %in) {
+; CHECK-LABEL: fneg_free(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.f32 %f1, [fneg_free_param_0];
+; CHECK-NEXT:    neg.f32 %f2, %f1;
+; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    ret;
+  %b = bitcast i32 %in to float
+  %f = fneg float %b
+  ret float %f
+}
diff --git a/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll b/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
index 43e4dfca1456..2b6631154e38 100644
--- a/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
+++ b/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
@@ -12,7 +12,7 @@ target triple = "nvptx-nvidia-cuda"
 @myconst = internal constant i32 420, align 4
 
 
-define void @foo(ptr %a, ptr %b) {
+define ptx_kernel void @foo(ptr %a, ptr %b) {
 ; Expect one load -- @myconst isn't loaded from, because we know its value
 ; statically.
 ; CHECK: ld.global.u32
@@ -24,7 +24,3 @@ define void @foo(ptr %a, ptr %b) {
   store i32 %ld2, ptr %b
   ret void
 }
-
-
-!nvvm.annotations = !{!0}
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/i1-array-global.ll b/llvm/test/CodeGen/NVPTX/i1-array-global.ll
index ff3848b6f8f7..20b376f94c0d 100644
--- a/llvm/test/CodeGen/NVPTX/i1-array-global.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-array-global.ll
@@ -7,13 +7,9 @@ target triple = "nvptx-nvidia-cuda"
 @global_cst = private constant [6 x i1] [i1 true, i1 false, i1 true, i1 false, i1 true, i1 false]
 
 ; CHECK: .global .align 1 .b8 global_cst[6] = {1, 0, 1, 0, 1}
-define void @kernel(i32 %i, ptr %out) {
+define ptx_kernel void @kernel(i32 %i, ptr %out) {
   %5 = getelementptr inbounds i1, ptr @global_cst, i32 %i
   %6 = load i1, ptr %5, align 1
   store i1 %6, ptr %out, align 1
   ret void
 }
-
-!nvvm.annotations = !{!0}
-!0 = !{ptr @kernel, !"kernel", i32 1}
-
diff --git a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
index 83f8f80919f8..f5f1dd9fcf0e 100644
--- a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
@@ -5,7 +5,7 @@
 
 target triple = "nvptx-nvidia-cuda"
 
-define void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
+define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
 ; CHECK-LABEL: foo(
 ; CHECK:    .reg .b16 %rs<2>;
 ; CHECK:    .reg .b32 %r<4>;
@@ -28,7 +28,3 @@ define void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
   store i32 %and, ptr %retval
   ret void
 }
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/i1-global.ll b/llvm/test/CodeGen/NVPTX/i1-global.ll
index 17af1fa29e6c..60d2ccd46419 100644
--- a/llvm/test/CodeGen/NVPTX/i1-global.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-global.ll
@@ -8,13 +8,9 @@ target triple = "nvptx-nvidia-cuda"
 @mypred = addrspace(1) global i1 true, align 1
 
 
-define void @foo(i1 %p, ptr %out) {
+define ptx_kernel void @foo(i1 %p, ptr %out) {
   %ld = load i1, ptr addrspace(1) @mypred
   %val = zext i1 %ld to i32
   store i32 %val, ptr %out
   ret void
 }
-
-
-!nvvm.annotations = !{!0}
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/i1-param.ll b/llvm/test/CodeGen/NVPTX/i1-param.ll
index 3c74ee6aaa3b..14d417bca459 100644
--- a/llvm/test/CodeGen/NVPTX/i1-param.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-param.ll
@@ -9,12 +9,8 @@ target triple = "nvptx-nvidia-cuda"
 ; CHECK: .entry foo
 ; CHECK:   .param .u8 foo_param_0
 ; CHECK:   .param .u64 .ptr .align 1 foo_param_1
-define void @foo(i1 %p, ptr %out) {
+define ptx_kernel void @foo(i1 %p, ptr %out) {
   %val = zext i1 %p to i32
   store i32 %val, ptr %out
   ret void
 }
-
-
-!nvvm.annotations = !{!0}
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/intr-range.ll b/llvm/test/CodeGen/NVPTX/intr-range.ll
index 2f3e08a039f5..86776ab09efc 100644
--- a/llvm/test/CodeGen/NVPTX/intr-range.ll
+++ b/llvm/test/CodeGen/NVPTX/intr-range.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 5
 ; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -mcpu=sm_20 -passes=nvvm-intr-range | FileCheck %s
 
-define i32 @test_maxntid() {
-; CHECK-LABEL: define i32 @test_maxntid(
+define ptx_kernel i32 @test_maxntid() {
+; CHECK-LABEL: define ptx_kernel i32 @test_maxntid(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 96) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ; CHECK-NEXT:    [[TMP3:%.*]] = call range(i32 0, 96) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
@@ -31,8 +31,8 @@ define i32 @test_maxntid() {
   ret i32 %11
 }
 
-define i32 @test_reqntid() {
-; CHECK-LABEL: define i32 @test_reqntid(
+define ptx_kernel i32 @test_reqntid() {
+; CHECK-LABEL: define ptx_kernel i32 @test_reqntid(
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ; CHECK-NEXT:    [[TMP5:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
@@ -64,8 +64,8 @@ define i32 @test_reqntid() {
 ;; A case like this could occur if a function with the sreg intrinsic was
 ;; inlined into a kernel where the tid metadata is present, ensure the range is
 ;; updated.
-define i32 @test_inlined() {
-; CHECK-LABEL: define i32 @test_inlined(
+define ptx_kernel i32 @test_inlined() {
+; CHECK-LABEL: define ptx_kernel i32 @test_inlined(
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call range(i32 0, 4) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ; CHECK-NEXT:    ret i32 [[TMP1]]
@@ -83,6 +83,6 @@ declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
 declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
 
 !nvvm.annotations = !{!0, !1, !2}
-!0 = !{ptr @test_maxntid, !"kernel", i32 1, !"maxntidx", i32 32, !"maxntidz", i32 3}
-!1 = !{ptr @test_reqntid, !"kernel", i32 1, !"reqntidx", i32 20}
-!2 = !{ptr @test_inlined, !"kernel", i32 1, !"maxntidx", i32 4}
+!0 = !{ptr @test_maxntid, !"maxntidx", i32 32, !"maxntidz", i32 3}
+!1 = !{ptr @test_reqntid, !"reqntidx", i32 20}
+!2 = !{ptr @test_inlined, !"maxntidx", i32 4}
diff --git a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll
index 93d428d6fe6f..2889d2d89a85 100644
--- a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll
+++ b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll
@@ -10,7 +10,7 @@
 ; CHECK: .param .u64 .ptr .shared .align 8  func_align_param_3
 ; CHECK: .param .u64 .ptr .const  .align 16 func_align_param_4
 ; CHECK: .param .u64 .ptr .local  .align 32 func_align_param_5
-define void @func_align(ptr nocapture readonly align 1 %input,
+define ptx_kernel void @func_align(ptr nocapture readonly align 1 %input,
                         ptr nocapture align 2 %out,
                         ptr addrspace(1) align 4 %global,
                         ptr addrspace(3) align 8 %shared,
@@ -27,7 +27,7 @@ entry:
 ; CHECK: .param .u64 .ptr .shared .align 1 func_noalign_param_3
 ; CHECK: .param .u64 .ptr .const  .align 1 func_noalign_param_4
 ; CHECK: .param .u64 .ptr .local  .align 1 func_noalign_param_5
-define void @func_noalign(ptr nocapture readonly %input,
+define ptx_kernel void @func_noalign(ptr nocapture readonly %input,
                           ptr nocapture %out,
                           ptr addrspace(1) %global,
                           ptr addrspace(3) %shared,
@@ -36,7 +36,3 @@ define void @func_noalign(ptr nocapture readonly %input,
 entry:
   ret void
 }
-
-!nvvm.annotations = !{!0, !1}
-!0 = !{ptr @func_align, !"kernel", i32 1}
-!1 = !{ptr @func_noalign, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
index bdaeccd53fac..dc1917f3b150 100644
--- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
+++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
@@ -10,7 +10,7 @@ target triple = "nvptx64-unknown-unknown"
 ; SM20: ld.global.f32
 ; SM35-LABEL: .visible .entry foo1(
 ; SM35: ld.global.nc.f32
-define void @foo1(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo1(ptr noalias readonly %from, ptr %to) {
   %1 = load float, ptr %from
   store float %1, ptr %to
   ret void
@@ -20,7 +20,7 @@ define void @foo1(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.f64
 ; SM35-LABEL: .visible .entry foo2(
 ; SM35: ld.global.nc.f64
-define void @foo2(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo2(ptr noalias readonly %from, ptr %to) {
   %1 = load double, ptr %from
   store double %1, ptr %to
   ret void
@@ -30,7 +30,7 @@ define void @foo2(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.u16
 ; SM35-LABEL: .visible .entry foo3(
 ; SM35: ld.global.nc.u16
-define void @foo3(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo3(ptr noalias readonly %from, ptr %to) {
   %1 = load i16, ptr %from
   store i16 %1, ptr %to
   ret void
@@ -40,7 +40,7 @@ define void @foo3(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.u32
 ; SM35-LABEL: .visible .entry foo4(
 ; SM35: ld.global.nc.u32
-define void @foo4(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo4(ptr noalias readonly %from, ptr %to) {
   %1 = load i32, ptr %from
   store i32 %1, ptr %to
   ret void
@@ -50,7 +50,7 @@ define void @foo4(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.u64
 ; SM35-LABEL: .visible .entry foo5(
 ; SM35: ld.global.nc.u64
-define void @foo5(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo5(ptr noalias readonly %from, ptr %to) {
   %1 = load i64, ptr %from
   store i64 %1, ptr %to
   ret void
@@ -63,7 +63,7 @@ define void @foo5(ptr noalias readonly %from, ptr %to) {
 ; SM35-LABEL: .visible .entry foo6(
 ; SM35: ld.global.nc.u64
 ; SM35: ld.global.nc.u64
-define void @foo6(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo6(ptr noalias readonly %from, ptr %to) {
   %1 = load i128, ptr %from
   store i128 %1, ptr %to
   ret void
@@ -73,7 +73,7 @@ define void @foo6(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v2.u8
 ; SM35-LABEL: .visible .entry foo7(
 ; SM35: ld.global.nc.v2.u8
-define void @foo7(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo7(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x i8>, ptr %from
   store <2 x i8> %1, ptr %to
   ret void
@@ -83,7 +83,7 @@ define void @foo7(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.u32
 ; SM35-LABEL: .visible .entry foo8(
 ; SM35: ld.global.nc.u32
-define void @foo8(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo8(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x i16>, ptr %from
   store <2 x i16> %1, ptr %to
   ret void
@@ -93,7 +93,7 @@ define void @foo8(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v2.u32
 ; SM35-LABEL: .visible .entry foo9(
 ; SM35: ld.global.nc.v2.u32
-define void @foo9(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo9(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x i32>, ptr %from
   store <2 x i32> %1, ptr %to
   ret void
@@ -103,7 +103,7 @@ define void @foo9(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v2.u64
 ; SM35-LABEL: .visible .entry foo10(
 ; SM35: ld.global.nc.v2.u64
-define void @foo10(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x i64>, ptr %from
   store <2 x i64> %1, ptr %to
   ret void
@@ -113,7 +113,7 @@ define void @foo10(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v2.f32
 ; SM35-LABEL: .visible .entry foo11(
 ; SM35: ld.global.nc.v2.f32
-define void @foo11(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x float>, ptr %from
   store <2 x float> %1, ptr %to
   ret void
@@ -123,7 +123,7 @@ define void @foo11(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v2.f64
 ; SM35-LABEL: .visible .entry foo12(
 ; SM35: ld.global.nc.v2.f64
-define void @foo12(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo12(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x double>, ptr %from
   store <2 x double> %1, ptr %to
   ret void
@@ -133,7 +133,7 @@ define void @foo12(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.u32
 ; SM35-LABEL: .visible .entry foo13(
 ; SM35: ld.global.nc.u32
-define void @foo13(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo13(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x i8>, ptr %from
   store <4 x i8> %1, ptr %to
   ret void
@@ -143,7 +143,7 @@ define void @foo13(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v4.u16
 ; SM35-LABEL: .visible .entry foo14(
 ; SM35: ld.global.nc.v4.u16
-define void @foo14(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo14(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x i16>, ptr %from
   store <4 x i16> %1, ptr %to
   ret void
@@ -153,7 +153,7 @@ define void @foo14(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v4.u32
 ; SM35-LABEL: .visible .entry foo15(
 ; SM35: ld.global.nc.v4.u32
-define void @foo15(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo15(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x i32>, ptr %from
   store <4 x i32> %1, ptr %to
   ret void
@@ -163,7 +163,7 @@ define void @foo15(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.v4.f32
 ; SM35-LABEL: .visible .entry foo16(
 ; SM35: ld.global.nc.v4.f32
-define void @foo16(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo16(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x float>, ptr %from
   store <4 x float> %1, ptr %to
   ret void
@@ -175,7 +175,7 @@ define void @foo16(ptr noalias readonly %from, ptr %to) {
 ; SM35-LABEL: .visible .entry foo17(
 ; SM35: ld.global.nc.v2.f64
 ; SM35: ld.global.nc.v2.f64
-define void @foo17(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo17(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x double>, ptr %from
   store <4 x double> %1, ptr %to
   ret void
@@ -185,7 +185,7 @@ define void @foo17(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.u64
 ; SM35-LABEL: .visible .entry foo18(
 ; SM35: ld.global.nc.u64
-define void @foo18(ptr noalias readonly %from, ptr %to) {
+define ptx_kernel void @foo18(ptr noalias readonly %from, ptr %to) {
   %1 = load ptr, ptr %from
   store ptr %1, ptr %to
   ret void
@@ -196,7 +196,7 @@ define void @foo18(ptr noalias readonly %from, ptr %to) {
 ; SM20: ld.global.f32
 ; SM35-LABEL: .visible .entry foo19(
 ; SM35: ld.global.nc.f32
-define void @foo19(ptr noalias readonly %from, ptr %to, i32 %n) {
+define ptx_kernel void @foo19(ptr noalias readonly %from, ptr %to, i32 %n) {
 entry:
   br label %loop
 
@@ -243,24 +243,3 @@ define void @notkernel2(ptr addrspace(1) noalias readonly %from, ptr %to) {
   store float %1, ptr %to
   ret void
 }
-
-!nvvm.annotations = !{!1 ,!2 ,!3 ,!4 ,!5 ,!6, !7 ,!8 ,!9 ,!10 ,!11 ,!12, !13, !14, !15, !16, !17, !18, !19}
-!1 = !{ptr @foo1, !"kernel", i32 1}
-!2 = !{ptr @foo2, !"kernel", i32 1}
-!3 = !{ptr @foo3, !"kernel", i32 1}
-!4 = !{ptr @foo4, !"kernel", i32 1}
-!5 = !{ptr @foo5, !"kernel", i32 1}
-!6 = !{ptr @foo6, !"kernel", i32 1}
-!7 = !{ptr @foo7, !"kernel", i32 1}
-!8 = !{ptr @foo8, !"kernel", i32 1}
-!9 = !{ptr @foo9, !"kernel", i32 1}
-!10 = !{ptr @foo10, !"kernel", i32 1}
-!11 = !{ptr @foo11, !"kernel", i32 1}
-!12 = !{ptr @foo12, !"kernel", i32 1}
-!13 = !{ptr @foo13, !"kernel", i32 1}
-!14 = !{ptr @foo14, !"kernel", i32 1}
-!15 = !{ptr @foo15, !"kernel", i32 1}
-!16 = !{ptr @foo16, !"kernel", i32 1}
-!17 = !{ptr @foo17, !"kernel", i32 1}
-!18 = !{ptr @foo18, !"kernel", i32 1}
-!19 = !{ptr @foo19, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
index e42f2303cdf7..f21ff974a2c6 100644
--- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -29,7 +29,7 @@ define void @foo(i32 %a) {
 ; PTX64:        ld.param.u32     %r{{[0-9]+}}, [foo2_param_0];
 ; PTX64:        add.u64          %rd[[SP_REG:[0-9]+]], %SPL, 0;
 ; PTX64:        st.local.u32  [%rd[[SP_REG]]], %r{{[0-9]+}};
-define void @foo2(i32 %a) {
+define ptx_kernel void @foo2(i32 %a) {
   %local = alloca i32, align 4
   store i32 %a, ptr %local
   call void @bar(ptr %local)
@@ -38,8 +38,6 @@ define void @foo2(i32 %a) {
 
 declare void @bar(ptr %a)
 
-!nvvm.annotations = !{!0}
-!0 = !{ptr @foo2, !"kernel", i32 1}
 
 ; PTX32:        mov.u32          %SPL, __local_depot{{[0-9]+}};
 ; PTX32-NOT:    cvta.local.u32   %SP, %SPL;
diff --git a/llvm/test/CodeGen/NVPTX/lower-alloca.ll b/llvm/test/CodeGen/NVPTX/lower-alloca.ll
index 8f2d55151b31..530b48b3d3e3 100644
--- a/llvm/test/CodeGen/NVPTX/lower-alloca.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-alloca.ll
@@ -6,7 +6,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx64-unknown-unknown"
 
-define void @kernel() {
+define ptx_kernel void @kernel() {
 ; LABEL: @lower_alloca
 ; PTX-LABEL: .visible .entry kernel(
   %A = alloca i32
@@ -37,7 +37,5 @@ define void @alloca_in_explicit_local_as() {
 declare void @callee(ptr)
 declare void @callee_addrspace5(ptr addrspace(5))
 
-!nvvm.annotations = !{!0}
 !nvvm.annotations = !{!1}
-!0 = !{ptr @kernel, !"kernel", i32 1}
 !1 = !{ptr @alloca_in_explicit_local_as, !"alloca_in_explicit_local_as", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 9cfe9192772b..208d4f0ef32a 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -29,7 +29,7 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
 ; PTX-NEXT:    .reg .pred %p<2>;
 ; PTX-NEXT:    .reg .b16 %rs<3>;
 ; PTX-NEXT:    .reg .b32 %r<11>;
-; PTX-NEXT:    .reg .b64 %rd<10>;
+; PTX-NEXT:    .reg .b64 %rd<9>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0: // %entry
 ; PTX-NEXT:    mov.u64 %SPL, __local_depot0;
@@ -38,23 +38,22 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
 ; PTX-NEXT:    and.b16 %rs2, %rs1, 1;
 ; PTX-NEXT:    setp.eq.b16 %p1, %rs2, 1;
 ; PTX-NEXT:    ld.param.s32 %rd1, [non_kernel_function_param_2];
-; PTX-NEXT:    add.u64 %rd2, %SP, 0;
-; PTX-NEXT:    or.b64 %rd3, %rd2, 8;
-; PTX-NEXT:    ld.param.u64 %rd4, [non_kernel_function_param_0+8];
-; PTX-NEXT:    st.u64 [%rd3], %rd4;
-; PTX-NEXT:    ld.param.u64 %rd5, [non_kernel_function_param_0];
-; PTX-NEXT:    st.u64 [%SP], %rd5;
-; PTX-NEXT:    mov.u64 %rd6, gi;
-; PTX-NEXT:    cvta.global.u64 %rd7, %rd6;
-; PTX-NEXT:    selp.b64 %rd8, %rd2, %rd7, %p1;
-; PTX-NEXT:    add.s64 %rd9, %rd8, %rd1;
-; PTX-NEXT:    ld.u8 %r1, [%rd9];
-; PTX-NEXT:    ld.u8 %r2, [%rd9+1];
+; PTX-NEXT:    ld.param.u64 %rd2, [non_kernel_function_param_0+8];
+; PTX-NEXT:    st.u64 [%SP+8], %rd2;
+; PTX-NEXT:    ld.param.u64 %rd3, [non_kernel_function_param_0];
+; PTX-NEXT:    st.u64 [%SP], %rd3;
+; PTX-NEXT:    mov.u64 %rd4, gi;
+; PTX-NEXT:    cvta.global.u64 %rd5, %rd4;
+; PTX-NEXT:    add.u64 %rd6, %SP, 0;
+; PTX-NEXT:    selp.b64 %rd7, %rd6, %rd5, %p1;
+; PTX-NEXT:    add.s64 %rd8, %rd7, %rd1;
+; PTX-NEXT:    ld.u8 %r1, [%rd8];
+; PTX-NEXT:    ld.u8 %r2, [%rd8+1];
 ; PTX-NEXT:    shl.b32 %r3, %r2, 8;
 ; PTX-NEXT:    or.b32 %r4, %r3, %r1;
-; PTX-NEXT:    ld.u8 %r5, [%rd9+2];
+; PTX-NEXT:    ld.u8 %r5, [%rd8+2];
 ; PTX-NEXT:    shl.b32 %r6, %r5, 16;
-; PTX-NEXT:    ld.u8 %r7, [%rd9+3];
+; PTX-NEXT:    ld.u8 %r7, [%rd8+3];
 ; PTX-NEXT:    shl.b32 %r8, %r7, 24;
 ; PTX-NEXT:    or.b32 %r9, %r8, %r6;
 ; PTX-NEXT:    or.b32 %r10, %r9, %r4;
@@ -68,7 +67,7 @@ entry:
   ret i32 %0, !dbg !23
 }
 
-define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
+define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
 ; PTX-LABEL: grid_const_int(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<4>;
@@ -82,7 +81,7 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou
 ; PTX-NEXT:    add.s32 %r3, %r2, %r1;
 ; PTX-NEXT:    st.global.u32 [%rd2], %r3;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_int(
+; OPT-LABEL: define ptx_kernel void @grid_const_int(
 ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[OUT2:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[OUT3:%.*]] = addrspacecast ptr addrspace(1) [[OUT2]] to ptr
@@ -91,6 +90,7 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou
 ; OPT-NEXT:    [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
 ; OPT-NEXT:    store i32 [[ADD]], ptr [[OUT3]], align 4
 ; OPT-NEXT:    ret void
+;
   %tmp = load i32, ptr %input1, align 4
   %add = add i32 %tmp, %input2
   store i32 %add, ptr %out
@@ -99,7 +99,7 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou
 
 %struct.s = type { i32, i32 }
 
-define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
+define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
 ; PTX-LABEL: grid_const_struct(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<4>;
@@ -113,7 +113,7 @@ define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
 ; PTX-NEXT:    add.s32 %r3, %r1, %r2;
 ; PTX-NEXT:    st.global.u32 [%rd2], %r3;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_struct(
+; OPT-LABEL: define ptx_kernel void @grid_const_struct(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[OUT4:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[OUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUT4]] to ptr
@@ -125,6 +125,7 @@ define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
 ; OPT-NEXT:    [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]]
 ; OPT-NEXT:    store i32 [[ADD]], ptr [[OUT5]], align 4
 ; OPT-NEXT:    ret void
+;
   %gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
   %gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
   %int1 = load i32, ptr %gep1
@@ -134,7 +135,7 @@ define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
   ret void
 }
 
-define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
+define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
 ; PTX-LABEL: grid_const_escape(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<3>;
@@ -159,17 +160,18 @@ define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
 ; PTX-NEXT:    ld.param.b32 %r1, [retval0];
 ; PTX-NEXT:    } // callseq 0
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_escape(
+; OPT-LABEL: define ptx_kernel void @grid_const_escape(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
 ; OPT-NEXT:    [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
 ; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
 ; OPT-NEXT:    ret void
+;
   %call = call i32 @escape(ptr %input)
   ret void
 }
 
-define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) {
+define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) {
 ; PTX-LABEL: multiple_grid_const_escape(
 ; PTX:       {
 ; PTX-NEXT:    .local .align 4 .b8 __local_depot4[4];
@@ -212,7 +214,7 @@ define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32
 ; PTX-NEXT:    ld.param.b32 %r2, [retval0];
 ; PTX-NEXT:    } // callseq 1
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @multiple_grid_const_escape(
+; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
 ; OPT-NEXT:    [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
@@ -222,13 +224,14 @@ define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32
 ; OPT-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
 ; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]])
 ; OPT-NEXT:    ret void
+;
   %a.addr = alloca i32, align 4
   store i32 %a, ptr %a.addr, align 4
   %call = call i32 @escape3(ptr %input, ptr %a.addr, ptr %b)
   ret void
 }
 
-define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) {
+define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) {
 ; PTX-LABEL: grid_const_memory_escape(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b64 %rd<6>;
@@ -241,7 +244,7 @@ define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %
 ; PTX-NEXT:    cvta.param.u64 %rd5, %rd4;
 ; PTX-NEXT:    st.global.u64 [%rd3], %rd5;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_memory_escape(
+; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[ADDR4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1)
 ; OPT-NEXT:    [[ADDR5:%.*]] = addrspacecast ptr addrspace(1) [[ADDR4]] to ptr
@@ -249,11 +252,12 @@ define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %
 ; OPT-NEXT:    [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
 ; OPT-NEXT:    store ptr [[INPUT1]], ptr [[ADDR5]], align 8
 ; OPT-NEXT:    ret void
+;
   store ptr %input, ptr %addr, align 8
   ret void
 }
 
-define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
+define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
 ; PTX-LABEL: grid_const_inlineasm_escape(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b64 %rd<8>;
@@ -271,7 +275,7 @@ define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, pt
 ; PTX-NEXT:    st.global.u64 [%rd6], %rd1;
 ; PTX-NEXT:    ret;
 ; PTX-NOT      .local
-; OPT-LABEL: define void @grid_const_inlineasm_escape(
+; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[RESULT4:%.*]] = addrspacecast ptr [[RESULT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[RESULT5:%.*]] = addrspacecast ptr addrspace(1) [[RESULT4]] to ptr
@@ -282,6 +286,7 @@ define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, pt
 ; OPT-NEXT:    [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
 ; OPT-NEXT:    store i64 [[TMP2]], ptr [[RESULT5]], align 8
 ; OPT-NEXT:    ret void
+;
   %tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
   %tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
   %1 = call i64 asm "add.s64 $0, $1, $2;", "=l,l,l"(ptr %tmpptr1, ptr %tmpptr2) #1
@@ -289,7 +294,7 @@ define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, pt
   ret void
 }
 
-define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
+define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
 ; PTX-LABEL: grid_const_partial_escape(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<5>;
@@ -319,7 +324,7 @@ define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
 ; PTX-NEXT:    ld.param.b32 %r3, [retval0];
 ; PTX-NEXT:    } // callseq 2
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_partial_escape(
+; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape(
 ; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
@@ -330,6 +335,7 @@ define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
 ; OPT-NEXT:    store i32 [[TWICE]], ptr [[OUTPUT5]], align 4
 ; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]])
 ; OPT-NEXT:    ret void
+;
   %val = load i32, ptr %input
   %twice = add i32 %val, %val
   store i32 %twice, ptr %output
@@ -337,7 +343,7 @@ define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
   ret void
 }
 
-define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
+define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
 ; PTX-LABEL: grid_const_partial_escapemem(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<6>;
@@ -369,7 +375,7 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu
 ; PTX-NEXT:    } // callseq 3
 ; PTX-NEXT:    st.param.b32 [func_retval0], %r3;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define i32 @grid_const_partial_escapemem(
+; OPT-LABEL: define ptx_kernel i32 @grid_const_partial_escapemem(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
@@ -383,6 +389,7 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu
 ; OPT-NEXT:    [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]]
 ; OPT-NEXT:    [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]])
 ; OPT-NEXT:    ret i32 [[ADD]]
+;
   %ptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
   %val1 = load i32, ptr %ptr1
   %ptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
@@ -393,7 +400,7 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu
   ret i32 %add
 }
 
-define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
+define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
 ; PTX-LABEL: grid_const_phi(
 ; PTX:       {
 ; PTX-NEXT:    .reg .pred %p<2>;
@@ -415,7 +422,7 @@ define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
 ; PTX-NEXT:    ld.u32 %r2, [%rd8];
 ; PTX-NEXT:    st.global.u32 [%rd1], %r2;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_phi(
+; OPT-LABEL: define ptx_kernel void @grid_const_phi(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
@@ -435,6 +442,7 @@ define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
 ; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
 ; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
 ; OPT-NEXT:    ret void
+;
 
   %val = load i32, ptr %inout
   %less = icmp slt i32 %val, 0
@@ -453,7 +461,7 @@ merge:
 }
 
 ; NOTE: %input2 is *not* grid_constant
-define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
+define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
 ; PTX-LABEL: grid_const_phi_ngc(
 ; PTX:       {
 ; PTX-NEXT:    .reg .pred %p<2>;
@@ -478,7 +486,7 @@ define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(
 ; PTX-NEXT:    ld.u32 %r2, [%rd11];
 ; PTX-NEXT:    st.global.u32 [%rd1], %r2;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_phi_ngc(
+; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
@@ -500,6 +508,7 @@ define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(
 ; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
 ; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
 ; OPT-NEXT:    ret void
+;
   %val = load i32, ptr %inout
   %less = icmp slt i32 %val, 0
   br i1 %less, label %first, label %second
@@ -517,7 +526,7 @@ merge:
 }
 
 ; NOTE: %input2 is *not* grid_constant
-define void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
+define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
 ; PTX-LABEL: grid_const_select(
 ; PTX:       {
 ; PTX-NEXT:    .reg .pred %p<2>;
@@ -539,7 +548,7 @@ define void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
 ; PTX-NEXT:    ld.u32 %r2, [%rd9];
 ; PTX-NEXT:    st.global.u32 [%rd3], %r2;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define void @grid_const_select(
+; OPT-LABEL: define ptx_kernel void @grid_const_select(
 ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
 ; OPT-NEXT:    [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
@@ -553,6 +562,7 @@ define void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
 ; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
 ; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
 ; OPT-NEXT:    ret void
+;
   %val = load i32, ptr %inout
   %less = icmp slt i32 %val, 0
   %ptrnew = select i1 %less, ptr %input1, ptr %input2
@@ -561,7 +571,7 @@ define void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
   ret void
 }
 
-define i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
+define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
 ; PTX-LABEL: grid_const_ptrtoint(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<4>;
@@ -576,7 +586,7 @@ define i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
 ; PTX-NEXT:    add.s32 %r3, %r1, %r2;
 ; PTX-NEXT:    st.param.b32 [func_retval0], %r3;
 ; PTX-NEXT:    ret;
-; OPT-LABEL: define i32 @grid_const_ptrtoint(
+; OPT-LABEL: define ptx_kernel i32 @grid_const_ptrtoint(
 ; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
 ; OPT-NEXT:    [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4
@@ -584,6 +594,7 @@ define i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
 ; OPT-NEXT:    [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
 ; OPT-NEXT:    [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]]
 ; OPT-NEXT:    ret i32 [[KEEPALIVE]]
+;
   %val = load i32, ptr %input
   %ptrval = ptrtoint ptr %input to i32
   %keepalive = add i32 %val, %ptrval
@@ -598,40 +609,40 @@ declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr
 
 !nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23}
 
-!0 = !{ptr @grid_const_int, !"kernel", i32 1, !"grid_constant", !1}
+!0 = !{ptr @grid_const_int, !"grid_constant", !1}
 !1 = !{i32 1}
 
-!2 = !{ptr @grid_const_struct, !"kernel", i32 1, !"grid_constant", !3}
+!2 = !{ptr @grid_const_struct, !"grid_constant", !3}
 !3 = !{i32 1}
 
-!4 = !{ptr @grid_const_escape, !"kernel", i32 1, !"grid_constant", !5}
+!4 = !{ptr @grid_const_escape, !"grid_constant", !5}
 !5 = !{i32 1}
 
-!6 = !{ptr @multiple_grid_const_escape, !"kernel", i32 1, !"grid_constant", !7}
+!6 = !{ptr @multiple_grid_const_escape, !"grid_constant", !7}
 !7 = !{i32 1, i32 3}
 
-!8 = !{ptr @grid_const_memory_escape, !"kernel", i32 1, !"grid_constant", !9}
+!8 = !{ptr @grid_const_memory_escape, !"grid_constant", !9}
 !9 = !{i32 1}
 
-!10 = !{ptr @grid_const_inlineasm_escape, !"kernel", i32 1, !"grid_constant", !11}
+!10 = !{ptr @grid_const_inlineasm_escape, !"grid_constant", !11}
 !11 = !{i32 1}
 
-!12 = !{ptr @grid_const_partial_escape, !"kernel", i32 1, !"grid_constant", !13}
+!12 = !{ptr @grid_const_partial_escape, !"grid_constant", !13}
 !13 = !{i32 1}
 
-!14 = !{ptr @grid_const_partial_escapemem, !"kernel", i32 1, !"grid_constant", !15}
+!14 = !{ptr @grid_const_partial_escapemem, !"grid_constant", !15}
 !15 = !{i32 1}
 
-!16 = !{ptr @grid_const_phi, !"kernel", i32 1, !"grid_constant", !17}
+!16 = !{ptr @grid_const_phi, !"grid_constant", !17}
 !17 = !{i32 1}
 
-!18 = !{ptr @grid_const_phi_ngc, !"kernel", i32 1, !"grid_constant", !19}
+!18 = !{ptr @grid_const_phi_ngc, !"grid_constant", !19}
 !19 = !{i32 1}
 
-!20 = !{ptr @grid_const_select, !"kernel", i32 1, !"grid_constant", !21}
+!20 = !{ptr @grid_const_select, !"grid_constant", !21}
 !21 = !{i32 1}
 
-!22 = !{ptr @grid_const_ptrtoint, !"kernel", i32 1, !"grid_constant", !23}
+!22 = !{ptr @grid_const_ptrtoint, !"grid_constant", !23}
 !23 = !{i32 1}
 
 
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index eba4f273fa70..269bba75dc5f 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -65,7 +65,7 @@ define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
 }
 
 ; COMMON-LABEL: ptr_generic
-define void @ptr_generic(ptr %out, ptr %in) {
+define ptx_kernel void @ptr_generic(ptr %out, ptr %in) {
 ; IRC:  %in3 = addrspacecast ptr %in to ptr addrspace(1)
 ; IRC:  %in4 = addrspacecast ptr addrspace(1) %in3 to ptr
 ; IRC:  %out1 = addrspacecast ptr %out to ptr addrspace(1)
@@ -87,7 +87,7 @@ define void @ptr_generic(ptr %out, ptr %in) {
 }
 
 ; COMMON-LABEL: ptr_nongeneric
-define void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(4) %in) {
+define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 ; IR-NOT: addrspacecast
 ; PTX-NOT: cvta.to.global
 ; PTX:  ld.const.u32
@@ -98,7 +98,7 @@ define void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 }
 
 ; COMMON-LABEL: ptr_as_int
- define void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
+ define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
 ; IR:   [[P:%.*]] = inttoptr i64 %i to ptr
 ; IRC:  [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
 ; IRC:  addrspacecast ptr addrspace(1) [[P1]] to ptr
@@ -121,7 +121,7 @@ define void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 %struct.S = type { i64 }
 
 ; COMMON-LABEL: ptr_as_int_aggr
-define void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) {
+define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) {
 ; IR:   [[S:%.*]] = addrspacecast ptr %s to ptr addrspace(101)
 ; IR:   [[I:%.*]] = load i64, ptr addrspace(101) [[S]], align 8
 ; IR:   [[P0:%.*]] = inttoptr i64 [[I]] to ptr
@@ -146,8 +146,3 @@ define void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) ali
 
 ; Function Attrs: convergent nounwind
 declare dso_local ptr @escape(ptr) local_unnamed_addr
-!nvvm.annotations = !{!0, !1, !2, !3}
-!0 = !{ptr @ptr_generic, !"kernel", i32 1}
-!1 = !{ptr @ptr_nongeneric, !"kernel", i32 1}
-!2 = !{ptr @ptr_as_int, !"kernel", i32 1}
-!3 = !{ptr @ptr_as_int_aggr, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 5c52626a711f..26102722a483 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -24,8 +24,8 @@ declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture read
 declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @read_only(
+define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @read_only(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -35,7 +35,7 @@ define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocap
 ; SM_60-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @read_only(
+; SM_70-LABEL: define dso_local ptx_kernel void @read_only(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -45,7 +45,7 @@ define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocap
 ; SM_70-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @read_only(
+; COPY-LABEL: define dso_local ptx_kernel void @read_only(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -62,8 +62,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @read_only_gep(
+define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -74,7 +74,7 @@ define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr n
 ; SM_60-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @read_only_gep(
+; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -85,7 +85,7 @@ define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr n
 ; SM_70-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @read_only_gep(
+; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -104,8 +104,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @read_only_gep_asc(
+define dso_local ptx_kernel void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep_asc(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -116,7 +116,7 @@ define dso_local void @read_only_gep_asc(ptr nocapture noundef writeonly %out, p
 ; SM_60-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @read_only_gep_asc(
+; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep_asc(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -127,7 +127,7 @@ define dso_local void @read_only_gep_asc(ptr nocapture noundef writeonly %out, p
 ; SM_70-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @read_only_gep_asc(
+; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -148,8 +148,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @read_only_gep_asc0(
+define dso_local ptx_kernel void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -164,7 +164,7 @@ define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out,
 ; SM_60-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @read_only_gep_asc0(
+; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -179,7 +179,7 @@ define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out,
 ; SM_70-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @read_only_gep_asc0(
+; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -202,8 +202,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @escape_ptr(
+define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr(
 ; SM_60-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -214,7 +214,7 @@ define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr nound
 ; SM_60-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[S3]])
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @escape_ptr(
+; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr(
 ; SM_70-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -225,7 +225,7 @@ define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr nound
 ; SM_70-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[S3]])
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @escape_ptr(
+; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr(
 ; COPY-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -240,8 +240,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @escape_ptr_gep(
+define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
 ; SM_60-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -253,7 +253,7 @@ define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr n
 ; SM_60-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[B]])
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @escape_ptr_gep(
+; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
 ; SM_70-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -265,7 +265,7 @@ define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr n
 ; SM_70-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[B]])
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @escape_ptr_gep(
+; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
 ; COPY-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -282,8 +282,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @escape_ptr_store(
+define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_store(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -294,7 +294,7 @@ define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, pt
 ; SM_60-NEXT:    store ptr [[S3]], ptr [[OUT2]], align 8
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @escape_ptr_store(
+; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_store(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -305,7 +305,7 @@ define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, pt
 ; SM_70-NEXT:    store ptr [[S3]], ptr [[OUT2]], align 8
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @escape_ptr_store(
+; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_store(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -320,8 +320,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @escape_ptr_gep_store(
+define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -333,7 +333,7 @@ define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out
 ; SM_60-NEXT:    store ptr [[B]], ptr [[OUT2]], align 8
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @escape_ptr_gep_store(
+; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -345,7 +345,7 @@ define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out
 ; SM_70-NEXT:    store ptr [[B]], ptr [[OUT2]], align 8
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @escape_ptr_gep_store(
+; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -362,8 +362,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @escape_ptrtoint(
+define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptrtoint(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -375,7 +375,7 @@ define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr
 ; SM_60-NEXT:    store i64 [[I]], ptr [[OUT2]], align 8
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @escape_ptrtoint(
+; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptrtoint(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -387,7 +387,7 @@ define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr
 ; SM_70-NEXT:    store i64 [[I]], ptr [[OUT2]], align 8
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @escape_ptrtoint(
+; COPY-LABEL: define dso_local ptx_kernel void @escape_ptrtoint(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -404,8 +404,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @memcpy_from_param(
+define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -414,7 +414,7 @@ define dso_local void @memcpy_from_param(ptr nocapture noundef writeonly %out, p
 ; SM_60-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @memcpy_from_param(
+; SM_70-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -423,7 +423,7 @@ define dso_local void @memcpy_from_param(ptr nocapture noundef writeonly %out, p
 ; SM_70-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @memcpy_from_param(
+; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -438,8 +438,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @memcpy_from_param_noalign (ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @memcpy_from_param_noalign(
+define dso_local ptx_kernel void @memcpy_from_param_noalign (ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
 ; SM_60-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -448,7 +448,7 @@ define dso_local void @memcpy_from_param_noalign (ptr nocapture noundef writeonl
 ; SM_60-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @memcpy_from_param_noalign(
+; SM_70-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
 ; SM_70-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
@@ -457,7 +457,7 @@ define dso_local void @memcpy_from_param_noalign (ptr nocapture noundef writeonl
 ; SM_70-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @memcpy_from_param_noalign(
+; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
 ; COPY-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 8
@@ -472,8 +472,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @memcpy_to_param(
+define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
 ; SM_60-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[ENTRY:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -484,7 +484,7 @@ define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr n
 ; SM_60-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true)
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @memcpy_to_param(
+; SM_70-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
 ; SM_70-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[ENTRY:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -495,7 +495,7 @@ define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr n
 ; SM_70-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true)
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @memcpy_to_param(
+; COPY-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
 ; COPY-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[ENTRY:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -510,8 +510,8 @@ entry:
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local void @copy_on_store(
+define dso_local ptx_kernel void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 {
+; SM_60-LABEL: define dso_local ptx_kernel void @copy_on_store(
 ; SM_60-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_60-NEXT:  [[BB:.*:]]
 ; SM_60-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -523,7 +523,7 @@ define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr noc
 ; SM_60-NEXT:    store i32 [[I]], ptr [[S3]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define dso_local void @copy_on_store(
+; SM_70-LABEL: define dso_local ptx_kernel void @copy_on_store(
 ; SM_70-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; SM_70-NEXT:  [[BB:.*:]]
 ; SM_70-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
@@ -535,7 +535,7 @@ define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr noc
 ; SM_70-NEXT:    store i32 [[I]], ptr [[S3]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define dso_local void @copy_on_store(
+; COPY-LABEL: define dso_local ptx_kernel void @copy_on_store(
 ; COPY-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COPY-NEXT:  [[BB:.*:]]
 ; COPY-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
@@ -551,8 +551,8 @@ bb:
   ret void
 }
 
-define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
-; SM_60-LABEL: define void @test_select(
+define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
+; SM_60-LABEL: define ptx_kernel void @test_select(
 ; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
 ; SM_60-NEXT:  [[BB:.*:]]
 ; SM_60-NEXT:    [[OUT7:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
@@ -568,7 +568,7 @@ define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2,
 ; SM_60-NEXT:    store i32 [[VALLOADED]], ptr [[OUT8]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define void @test_select(
+; SM_70-LABEL: define ptx_kernel void @test_select(
 ; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
 ; SM_70-NEXT:  [[BB:.*:]]
 ; SM_70-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
@@ -582,7 +582,7 @@ define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2,
 ; SM_70-NEXT:    store i32 [[VALLOADED]], ptr [[OUT2]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define void @test_select(
+; COPY-LABEL: define ptx_kernel void @test_select(
 ; COPY-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
 ; COPY-NEXT:  [[BB:.*:]]
 ; COPY-NEXT:    [[INPUT23:%.*]] = alloca i32, align 4
@@ -603,8 +603,8 @@ bb:
   ret void
 }
 
-define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
-; SM_60-LABEL: define void @test_select_write(
+define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
+; SM_60-LABEL: define ptx_kernel void @test_select_write(
 ; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
 ; SM_60-NEXT:  [[BB:.*:]]
 ; SM_60-NEXT:    [[OUT5:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
@@ -619,7 +619,7 @@ define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
 ; SM_60-NEXT:    store i32 1, ptr [[PTRNEW]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define void @test_select_write(
+; SM_70-LABEL: define ptx_kernel void @test_select_write(
 ; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
 ; SM_70-NEXT:  [[BB:.*:]]
 ; SM_70-NEXT:    [[OUT5:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
@@ -634,7 +634,7 @@ define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
 ; SM_70-NEXT:    store i32 1, ptr [[PTRNEW]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define void @test_select_write(
+; COPY-LABEL: define ptx_kernel void @test_select_write(
 ; COPY-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
 ; COPY-NEXT:  [[BB:.*:]]
 ; COPY-NEXT:    [[INPUT23:%.*]] = alloca i32, align 4
@@ -653,8 +653,8 @@ bb:
   ret void
 }
 
-define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, ptr %inout, i1 %cond) {
-; SM_60-LABEL: define void @test_phi(
+define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, ptr %inout, i1 %cond) {
+; SM_60-LABEL: define ptx_kernel void @test_phi(
 ; SM_60-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
 ; SM_60-NEXT:  [[BB:.*:]]
 ; SM_60-NEXT:    [[INOUT7:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
@@ -678,7 +678,7 @@ define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S)
 ; SM_60-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT8]], align 4
 ; SM_60-NEXT:    ret void
 ;
-; SM_70-LABEL: define void @test_phi(
+; SM_70-LABEL: define ptx_kernel void @test_phi(
 ; SM_70-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
 ; SM_70-NEXT:  [[BB:.*:]]
 ; SM_70-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
@@ -700,7 +700,7 @@ define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S)
 ; SM_70-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
 ; SM_70-NEXT:    ret void
 ;
-; COPY-LABEL: define void @test_phi(
+; COPY-LABEL: define ptx_kernel void @test_phi(
 ; COPY-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
 ; COPY-NEXT:  [[BB:.*:]]
 ; COPY-NEXT:    [[INPUT23:%.*]] = alloca [[STRUCT_S]], align 8
@@ -740,8 +740,8 @@ merge:                                            ; preds = %second, %first
   ret void
 }
 
-define void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, i1 %cond) {
-; COMMON-LABEL: define void @test_phi_write(
+define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, i1 %cond) {
+; COMMON-LABEL: define ptx_kernel void @test_phi_write(
 ; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
 ; COMMON-NEXT:  [[BB:.*:]]
 ; COMMON-NEXT:    [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
@@ -784,29 +784,11 @@ attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite
 attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
 
 !llvm.module.flags = !{!0, !1, !2, !3}
-!nvvm.annotations = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !23}
 !llvm.ident = !{!20, !21}
 
 !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
 !3 = !{i32 7, !"frame-pointer", i32 2}
-!4 = !{ptr @read_only, !"kernel", i32 1}
-!5 = !{ptr @escape_ptr, !"kernel", i32 1}
-!6 = !{ptr @escape_ptr_gep, !"kernel", i32 1}
-!7 = !{ptr @escape_ptr_store, !"kernel", i32 1}
-!8 = !{ptr @escape_ptr_gep_store, !"kernel", i32 1}
-!9 = !{ptr @escape_ptrtoint, !"kernel", i32 1}
-!10 = !{ptr @memcpy_from_param, !"kernel", i32 1}
-!11 = !{ptr @memcpy_to_param, !"kernel", i32 1}
-!12 = !{ptr @copy_on_store, !"kernel", i32 1}
-!13 = !{ptr @read_only_gep, !"kernel", i32 1}
-!14 = !{ptr @read_only_gep_asc, !"kernel", i32 1}
-!15 = !{ptr @read_only_gep_asc0, !"kernel", i32 1}
-!16 = !{ptr @test_select, !"kernel", i32 1}
-!17 = !{ptr @test_phi, !"kernel", i32 1}
-!18 = !{ptr @test_phi_write, !"kernel", i32 1}
-!19 = !{ptr @test_select_write, !"kernel", i32 1}
 !20 = !{!"clang version 20.0.0git"}
 !21 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
-!23 = !{ptr @memcpy_from_param_noalign, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
index f8b3b4b9b8c4..4ee1ca3ad4b1 100644
--- a/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
@@ -43,7 +43,7 @@ define internal void @bar() {
   ret void
 }
 
-; CHECK-LABEL: define weak_odr void @"nvptx$device$init"() {
+; CHECK-LABEL: define weak_odr ptx_kernel void @"nvptx$device$init"() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BEGIN:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__init_array_start, align 8
 ; CHECK-NEXT:    [[STOP:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__init_array_end, align 8
@@ -60,7 +60,7 @@ define internal void @bar() {
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define weak_odr void @"nvptx$device$fini"() {
+; CHECK-LABEL: define weak_odr ptx_kernel void @"nvptx$device$fini"() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BEGIN:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__fini_array_start, align 8
 ; CHECK-NEXT:    [[STOP:%.*]] = load ptr addrspace(1), ptr addrspace(1) @__fini_array_end, align 8
@@ -82,12 +82,10 @@ define internal void @bar() {
 ; CHECK:       while.end:
 ; CHECK-NEXT:    ret void
 
-; CHECK: [[META0:![0-9]+]] = !{ptr @"nvptx$device$init", !"kernel", i32 1}
 ; CHECK: [[META1:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidx", i32 1}
 ; CHECK: [[META2:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidy", i32 1}
 ; CHECK: [[META3:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxntidz", i32 1}
 ; CHECK: [[META4:![0-9]+]] = !{ptr @"nvptx$device$init", !"maxclusterrank", i32 1}
-; CHECK: [[META5:![0-9]+]] = !{ptr @"nvptx$device$fini", !"kernel", i32 1}
 ; CHECK: [[META6:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidx", i32 1}
 ; CHECK: [[META7:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidy", i32 1}
 ; CHECK: [[META8:![0-9]+]] = !{ptr @"nvptx$device$fini", !"maxntidz", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
index 9ec690a68e7e..2e64c2559481 100644
--- a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
@@ -6,7 +6,7 @@ target triple = "nvptx64-nvidia-cuda"
 
 ; Verify that both %input and %output are converted to global pointers and then
 ; addrspacecast'ed back to the original type.
-define void @kernel(ptr %input, ptr %output) {
+define ptx_kernel void @kernel(ptr %input, ptr %output) {
 ; CHECK-LABEL: .visible .entry kernel(
 ; CHECK: cvta.to.global.u64
 ; CHECK: cvta.to.global.u64
@@ -17,7 +17,7 @@ define void @kernel(ptr %input, ptr %output) {
   ret void
 }
 
-define void @kernel2(ptr addrspace(1) %input, ptr addrspace(1) %output) {
+define ptx_kernel void @kernel2(ptr addrspace(1) %input, ptr addrspace(1) %output) {
 ; CHECK-LABEL: .visible .entry kernel2(
 ; CHECK-NOT: cvta.to.global.u64
   %1 = load float, ptr addrspace(1) %input, align 4
@@ -29,7 +29,7 @@ define void @kernel2(ptr addrspace(1) %input, ptr addrspace(1) %output) {
 
 %struct.S = type { ptr, ptr }
 
-define void @ptr_in_byval_kernel(ptr byval(%struct.S) %input, ptr %output) {
+define ptx_kernel void @ptr_in_byval_kernel(ptr byval(%struct.S) %input, ptr %output) {
 ; CHECK-LABEL: .visible .entry ptr_in_byval_kernel(
 ; CHECK: ld.param.u64 	%[[optr:rd.*]], [ptr_in_byval_kernel_param_1]
 ; CHECK: cvta.to.global.u64 %[[optr_g:.*]], %[[optr]];
@@ -60,7 +60,3 @@ define void @ptr_in_byval_func(ptr byval(%struct.S) %input, ptr %output) {
   ret void
 }
 
-!nvvm.annotations = !{!0, !1, !2}
-!0 = !{ptr @kernel, !"kernel", i32 1}
-!1 = !{ptr @kernel2, !"kernel", i32 1}
-!2 = !{ptr @ptr_in_byval_kernel, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
index 3389e090aac5..c445c34c1842 100644
--- a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
+++ b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
@@ -11,16 +11,15 @@ target triple = "nvptx64-unknown-unknown"
 
 ; Make sure that for SM version prior to 90 `.maxclusterrank` directive is
 ; sielently ignored.
-define dso_local void @_Z18TestMaxClusterRankv() {
+define dso_local ptx_kernel void @_Z18TestMaxClusterRankv() {
 entry:
   %a = alloca i32, align 4
   store volatile i32 1, ptr %a, align 4
   ret void
 }
 
-!nvvm.annotations = !{!0, !1, !2, !3}
+!nvvm.annotations = !{!1, !2, !3}
 
-!0 = !{ptr @_Z18TestMaxClusterRankv, !"kernel", i32 1}
 !1 = !{ptr @_Z18TestMaxClusterRankv, !"maxntidx", i32 128}
 !2 = !{ptr @_Z18TestMaxClusterRankv, !"minctasm", i32 2}
 !3 = !{ptr @_Z18TestMaxClusterRankv, !"maxclusterrank", i32 8}
diff --git a/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll b/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
index 2bc6d4cfa7f6..2a0c5ab7299b 100644
--- a/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
+++ b/llvm/test/CodeGen/NVPTX/noduplicate-syncthreads.ll
@@ -66,7 +66,4 @@ if.end17:                                         ; preds = %if.else13, %if.then
 }
 
 ; Function Attrs: noduplicate nounwind
-declare void @llvm.nvvm.barrier0() #2
-
-!0 = !{ptr @foo, !"kernel", i32 1}
-!1 = !{null, !"align", i32 8}
+declare void @llvm.nvvm.barrier0() #2
+\ No newline at end of file
diff --git a/llvm/test/CodeGen/NVPTX/noreturn.ll b/llvm/test/CodeGen/NVPTX/noreturn.ll
index 2161d70a8852..6c11d0a9376a 100644
--- a/llvm/test/CodeGen/NVPTX/noreturn.ll
+++ b/llvm/test/CodeGen/NVPTX/noreturn.ll
@@ -27,7 +27,7 @@ define void @true_noreturn0() #0 {
 ; CHECK: .entry ignore_kernel_noreturn()
 ; CHECK-NOT: .noreturn
 
-define void @ignore_kernel_noreturn() #0 {
+define ptx_kernel void @ignore_kernel_noreturn() #0 {
   unreachable
 }
 
@@ -35,7 +35,7 @@ define void @ignore_kernel_noreturn() #0 {
 ; CHECK: prototype_{{[0-9]+}} : .callprototype ()_ (.param .b32 _) .noreturn;
 ; CHECK: prototype_{{[0-9]+}} : .callprototype (.param .b32 _) _ (.param .b32 _);
 
-define void @callprototype_noreturn(i32) {
+define ptx_kernel void @callprototype_noreturn(i32) {
   %fn = load ptr, ptr addrspace(1) @function_pointer
   call void %fn(i32 %0) #0
   %non_void = bitcast ptr %fn to ptr
@@ -44,8 +44,3 @@ define void @callprototype_noreturn(i32) {
 }
 
 attributes #0 = { noreturn }
-
-!nvvm.annotations = !{!0, !1}
-
-!0 = !{ptr @ignore_kernel_noreturn, !"kernel", i32 1}
-!1 = !{ptr @callprototype_noreturn, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll b/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll
index 48162eaba257..9a78d31302e1 100644
--- a/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll
+++ b/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll
@@ -3,7 +3,7 @@
 
 target triple = "nvptx-unknown-nvcl"
 
-define void @foo(i64 %img, i64 %sampler, ptr align 32 %v1, ptr %v2) {
+define ptx_kernel void @foo(i64 %img, i64 %sampler, ptr align 32 %v1, ptr %v2) {
 ; The parameter alignment is determined by the align attribute (default 1).
 ; CHECK-LABEL: .entry foo(
 ; CHECK: .param .u64 .ptr .align 32 foo_param_2
@@ -11,7 +11,6 @@ define void @foo(i64 %img, i64 %sampler, ptr align 32 %v1, ptr %v2) {
   ret void
 }
 
-!nvvm.annotations = !{!1, !2, !3}
-!1 = !{ptr @foo, !"kernel", i32 1}
+!nvvm.annotations = !{!2, !3}
 !2 = !{ptr @foo, !"rdoimage", i32 0}
 !3 = !{ptr @foo, !"sampler", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll
index ac5875c6ab10..83cb3cde48de 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll
@@ -1,9 +1,9 @@
 ; Libdevice in recent CUDA versions relies on __CUDA_ARCH reflecting GPU type.
 ; Verify that __nvvm_reflect() is replaced with an appropriate value.
 ;
-; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_20 \
+; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_20 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM20
-; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_35 \
+; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_35 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM35
 
 @"$str" = private addrspace(1) constant [12 x i8] c"__CUDA_ARCH\00"
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll
index 9d383218dce8..bf8d6e2cca30 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll
@@ -1,8 +1,8 @@
 ; Verify that __nvvm_reflect_ocl() is replaced with an appropriate value
 ;
-; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_20 \
+; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_20 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM20
-; RUN: opt %s -S -passes='default<O2>' -mtriple=nvptx64 -mcpu=sm_35 \
+; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_35 \
 ; RUN:   | FileCheck %s --check-prefixes=COMMON,SM35
 
 @"$str" = private addrspace(4) constant [12 x i8] c"__CUDA_ARCH\00"
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
index 46ab79d9858c..19c74df30370 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll
@@ -3,12 +3,12 @@
 
 ; RUN: cat %s > %t.noftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz
-; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
+; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK
 
 ; RUN: cat %s > %t.ftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz
-; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
+; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK
 
 @str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00"
@@ -43,7 +43,7 @@ exit:
 
 declare i32 @llvm.nvvm.reflect(ptr)
 
-; CHECK-LABEL: define noundef i32 @intrinsic
+; CHECK-LABEL: define i32 @intrinsic
 define i32 @intrinsic() {
 ; CHECK-NOT: call i32 @llvm.nvvm.reflect
 ; USE_FTZ_0: ret i32 0
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
index 2ed9f7c11bcf..244b44fea9b8 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll
@@ -3,12 +3,12 @@
 
 ; RUN: cat %s > %t.noftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz
-; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
+; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK
 
 ; RUN: cat %s > %t.ftz
 ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz
-; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='default<O2>' \
+; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \
 ; RUN:   | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK
 
 @str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00"
@@ -43,7 +43,8 @@ exit:
 
 declare i32 @llvm.nvvm.reflect(ptr)
 
-; CHECK-LABEL: define noundef i32 @intrinsic
+; CHECK-LABEL: define i32 @intrinsic
+
 define i32 @intrinsic() {
 ; CHECK-NOT: call i32 @llvm.nvvm.reflect
 ; USE_FTZ_0: ret i32 0
diff --git a/llvm/test/CodeGen/NVPTX/refl1.ll b/llvm/test/CodeGen/NVPTX/refl1.ll
index 34db3bb1a1a9..99b83f49ff9b 100644
--- a/llvm/test/CodeGen/NVPTX/refl1.ll
+++ b/llvm/test/CodeGen/NVPTX/refl1.ll
@@ -5,7 +5,7 @@ target triple = "nvptx-nvidia-cuda"
 
 ; Function Attrs: nounwind
 ; CHECK: .entry foo
-define void @foo(ptr nocapture %a) #0 {
+define ptx_kernel void @foo(ptr nocapture %a) #0 {
   %val = load float, ptr %a
   %tan = tail call fastcc float @__nv_fast_tanf(float %val)
   store float %tan, ptr %a
@@ -34,7 +34,3 @@ entry:
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 attributes #2 = { alwaysinline inlinehint nounwind readnone }
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/reg-copy.ll b/llvm/test/CodeGen/NVPTX/reg-copy.ll
index f66ef195c625..20396c4cc69f 100644
--- a/llvm/test/CodeGen/NVPTX/reg-copy.ll
+++ b/llvm/test/CodeGen/NVPTX/reg-copy.ll
@@ -4,7 +4,7 @@
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-unknown-unknown"
 
-define void @PR24303(ptr %f) {
+define ptx_kernel void @PR24303(ptr %f) {
 ; CHECK-LABEL: .visible .entry PR24303(
 ; Do not use mov.f or mov.u to convert between float and int.
 ; CHECK-NOT: mov.{{f|u}}{{32|64}} %f{{[0-9]+}}, %r{{[0-9]+}}
@@ -217,7 +217,3 @@ _ZN12cuda_builtinmlIfEENS_7complexIT_EERKS3_S5_.exit: ; preds = %if.then.93.i, %
 }
 
 declare float @llvm.nvvm.fabs.f(float)
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @PR24303, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/simple-call.ll b/llvm/test/CodeGen/NVPTX/simple-call.ll
index 3580604d429d..991ae04b91b6 100644
--- a/llvm/test/CodeGen/NVPTX/simple-call.ll
+++ b/llvm/test/CodeGen/NVPTX/simple-call.ll
@@ -10,7 +10,7 @@ define float @device_func(float %a) noinline {
 }
 
 ; CHECK: .entry kernel_func
-define void @kernel_func(ptr %a) {
+define ptx_kernel void @kernel_func(ptr %a) {
   %val = load float, ptr %a
 ; CHECK: call.uni (retval0),
 ; CHECK: device_func,
@@ -18,9 +18,3 @@ define void @kernel_func(ptr %a) {
   store float %mul, ptr %a
   ret void
 }
-
-
-
-!nvvm.annotations = !{!1}
-
-!1 = !{ptr @kernel_func, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
index 504dcdeb3370..7a7904a2f042 100644
--- a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
@@ -10,7 +10,7 @@ declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32)
 declare i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1))
 
 
-define void @foo(i64 %img, ptr %red, i32 %idx) {
+define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
 ; CHECK-LABEL: foo(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -34,7 +34,7 @@ define void @foo(i64 %img, ptr %red, i32 %idx) {
 
 @surf0 = internal addrspace(1) global i64 0, align 8
 
-define void @bar(ptr %red, i32 %idx) {
+define ptx_kernel void @bar(ptr %red, i32 %idx) {
 ; CHECK-LABEL: bar(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -56,11 +56,5 @@ define void @bar(ptr %red, i32 %idx) {
   ret void
 }
 
-
-
-
-!nvvm.annotations = !{!1, !2, !3}
-!1 = !{ptr @foo, !"kernel", i32 1}
-!2 = !{ptr @bar, !"kernel", i32 1}
-!3 = !{ptr addrspace(1) @surf0, !"surface", i32 1}
-
+!nvvm.annotations = !{!1}
+!1 = !{ptr addrspace(1) @surf0, !"surface", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/surf-read.ll b/llvm/test/CodeGen/NVPTX/surf-read.ll
index e0cebd60d7dd..cd11b5617076 100644
--- a/llvm/test/CodeGen/NVPTX/surf-read.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-read.ll
@@ -6,7 +6,7 @@ target triple = "nvptx64-unknown-nvcl"
 declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32)
 
 ; CHECK: .entry foo
-define void @foo(i64 %img, ptr %red, i32 %idx) {
+define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
 ; CHECK: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [foo_param_0, {%r{{[0-9]+}}}]
   %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %img, i32 %idx)
 ; CHECK: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
@@ -16,6 +16,5 @@ define void @foo(i64 %img, ptr %red, i32 %idx) {
   ret void
 }
 
-!nvvm.annotations = !{!1, !2}
-!1 = !{ptr @foo, !"kernel", i32 1}
-!2 = !{ptr @foo, !"rdwrimage", i32 0}
+!nvvm.annotations = !{!1}
+!1 = !{ptr @foo, !"rdwrimage", i32 0}
diff --git a/llvm/test/CodeGen/NVPTX/surf-tex.py b/llvm/test/CodeGen/NVPTX/surf-tex.py
index 9607a58856ba..90d67666f1ed 100644
--- a/llvm/test/CodeGen/NVPTX/surf-tex.py
+++ b/llvm/test/CodeGen/NVPTX/surf-tex.py
@@ -224,11 +224,6 @@ def get_ptx_surface(target):
 def get_surface_metadata(target, fun_ty, fun_name, has_surface_param):
     metadata = []
 
-    md_kernel = '!{{{fun_ty} @{fun_name}, !"kernel", i32 1}}'.format(
-        fun_ty=fun_ty, fun_name=fun_name
-    )
-    metadata.append(md_kernel)
-
     if target == "cuda":
         # When a parameter is lowered as a .surfref, it still has the
         # corresponding ld.param.u64, which is illegal. Do not emit the
@@ -263,14 +258,14 @@ def gen_suld_tests(target, global_surf):
   ; CHECK-LABEL: .entry ${test_name}_param
   ; CHECK: ${instruction} ${reg_ret}, [${reg_surf}, ${reg_access}]
   ;
-  define void @${test_name}_param(i64 %s, ${retty}* %ret, ${access}) {
+  define ptx_kernel void @${test_name}_param(i64 %s, ${retty}* %ret, ${access}) {
     %val = tail call ${retty} @${intrinsic}(i64 %s, ${access})
     store ${retty} %val, ${retty}* %ret
     ret void
   }
   ; CHECK-LABEL: .entry ${test_name}_global
   ; CHECK: ${instruction} ${reg_ret}, [${global_surf}, ${reg_access}]
-  define void @${test_name}_global(${retty}* %ret, ${access}) {
+  define ptx_kernel void @${test_name}_global(${retty}* %ret, ${access}) {
     %gs = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_surf})
     %val = tail call ${retty} @${intrinsic}(i64 %gs, ${access})
     store ${retty} %val, ${retty}* %ret
@@ -356,13 +351,13 @@ def gen_sust_tests(target, global_surf):
   ; CHECK-LABEL: .entry ${test_name}_param
   ; CHECK: ${instruction} [${reg_surf}, ${reg_access}], ${reg_value}
   ;
-  define void @${test_name}_param(i64 %s, ${value}, ${access}) {
+  define ptx_kernel void @${test_name}_param(i64 %s, ${value}, ${access}) {
     tail call void @${intrinsic}(i64 %s, ${access}, ${value})
     ret void
   }
   ; CHECK-LABEL: .entry ${test_name}_global
   ; CHECK: ${instruction} [${global_surf}, ${reg_access}], ${reg_value}
-  define void @${test_name}_global(${value}, ${access}) {
+  define ptx_kernel void @${test_name}_global(${value}, ${access}) {
     %gs = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_surf})
     tail call void @${intrinsic}(i64 %gs, ${access}, ${value})
     ret void
@@ -420,19 +415,13 @@ def gen_sust_tests(target, global_surf):
         generated_items.append((params["intrinsic"], params["instruction"]))
 
         fun_name = test_name + "_param"
-        fun_ty = "void (i64, {value_ty}, {access_ty})*".format(
-            value_ty=get_llvm_value_type(vec, ctype),
-            access_ty=get_llvm_surface_access_type(geom),
-        )
+        fun_ty = "ptr"
         generated_metadata += get_surface_metadata(
             target, fun_ty, fun_name, has_surface_param=True
         )
 
         fun_name = test_name + "_global"
-        fun_ty = "void ({value_ty}, {access_ty})*".format(
-            value_ty=get_llvm_value_type(vec, ctype),
-            access_ty=get_llvm_surface_access_type(geom),
-        )
+        fun_ty = "ptr"
         generated_metadata += get_surface_metadata(
             target, fun_ty, fun_name, has_surface_param=False
         )
@@ -559,11 +548,6 @@ def get_ptx_global_sampler(target, global_sampler):
 def get_texture_metadata(target, fun_ty, fun_name, has_texture_params):
     metadata = []
 
-    md_kernel = '!{{{fun_ty} @{fun_name}, !"kernel", i32 1}}'.format(
-        fun_ty=fun_ty, fun_name=fun_name
-    )
-    metadata.append(md_kernel)
-
     if target == "cuda":
         # When a parameter is lowered as a .texref, it still has the
         # corresponding ld.param.u64, which is illegal. Do not emit the
@@ -615,14 +599,14 @@ def gen_tex_tests(target, global_tex, global_sampler):
 
   ; CHECK-LABEL: .entry ${test_name}_param
   ; CHECK: ${instruction} ${ptx_ret}, [${ptx_tex}, ${ptx_access}]
-  define void @${test_name}_param(i64 %tex, ${sampler} ${retty}* %ret, ${access}) {
+  define ptx_kernel void @${test_name}_param(i64 %tex, ${sampler} ${retty}* %ret, ${access}) {
     %val = tail call ${retty} @${intrinsic}(i64 %tex, ${sampler} ${access})
     store ${retty} %val, ${retty}* %ret
     ret void
   }
   ; CHECK-LABEL: .entry ${test_name}_global
   ; CHECK: ${instruction} ${ptx_ret}, [${global_tex}, ${ptx_global_sampler} ${ptx_access}]
-  define void @${test_name}_global(${retty}* %ret, ${access}) {
+  define ptx_kernel void @${test_name}_global(${retty}* %ret, ${access}) {
     %gt = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_tex})
     ${get_sampler_handle}
     %val = tail call ${retty} @${intrinsic}(i64 %gt, ${sampler} ${access})
@@ -799,14 +783,14 @@ def gen_tld4_tests(target, global_tex, global_sampler):
 
   ; CHECK-LABEL: .entry ${test_name}_param
   ; CHECK: ${instruction} ${ptx_ret}, [${ptx_tex}, ${ptx_access}]
-  define void @${test_name}_param(i64 %tex, ${sampler} ${retty}* %ret, ${access}) {
+  define ptx_kernel void @${test_name}_param(i64 %tex, ${sampler} ${retty}* %ret, ${access}) {
     %val = tail call ${retty} @${intrinsic}(i64 %tex, ${sampler} ${access})
     store ${retty} %val, ${retty}* %ret
     ret void
   }
   ; CHECK-LABEL: .entry ${test_name}_global
   ; CHECK: ${instruction} ${ptx_ret}, [${global_tex}, ${ptx_global_sampler} ${ptx_access}]
-  define void @${test_name}_global(${retty}* %ret, ${access}) {
+  define ptx_kernel void @${test_name}_global(${retty}* %ret, ${access}) {
     %gt = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @${global_tex})
     ${get_sampler_handle}
     %val = tail call ${retty} @${intrinsic}(i64 %gt, ${sampler} ${access})
diff --git a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
index 881ea459feb4..5dc44cb1925b 100644
--- a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
@@ -10,7 +10,7 @@ declare void @llvm.nvvm.sust.b.1d.i32.trap(i64, i32, i32)
 declare i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1))
 
 
-define void @foo(i64 %img, i32 %val, i32 %idx) {
+define ptx_kernel void @foo(i64 %img, i32 %val, i32 %idx) {
 ; CHECK-LABEL: foo(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -30,7 +30,7 @@ define void @foo(i64 %img, i32 %val, i32 %idx) {
 @surf0 = internal addrspace(1) global i64 0, align 8
 
 
-define void @bar(i32 %val, i32 %idx) {
+define ptx_kernel void @bar(i32 %val, i32 %idx) {
 ; CHECK-LABEL: bar(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
@@ -47,8 +47,6 @@ define void @bar(i32 %val, i32 %idx) {
 }
 
 
-!nvvm.annotations = !{!1, !2, !3}
-!1 = !{ptr @foo, !"kernel", i32 1}
-!2 = !{ptr @bar, !"kernel", i32 1}
-!3 = !{ptr addrspace(1) @surf0, !"surface", i32 1}
+!nvvm.annotations = !{!1}
+!1 = !{ptr addrspace(1) @surf0, !"surface", i32 1}
 
diff --git a/llvm/test/CodeGen/NVPTX/surf-write.ll b/llvm/test/CodeGen/NVPTX/surf-write.ll
index 258bb6d8b5b7..0e1f0cc70099 100644
--- a/llvm/test/CodeGen/NVPTX/surf-write.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-write.ll
@@ -6,12 +6,11 @@ target triple = "nvptx-unknown-nvcl"
 declare void @llvm.nvvm.sust.b.1d.i32.trap(i64, i32, i32)
 
 ; CHECK: .entry foo
-define void @foo(i64 %img, i32 %val, i32 %idx) {
+define ptx_kernel void @foo(i64 %img, i32 %val, i32 %idx) {
 ; CHECK: sust.b.1d.b32.trap [foo_param_0, {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
   tail call void @llvm.nvvm.sust.b.1d.i32.trap(i64 %img, i32 %idx, i32 %val)
   ret void
 }
 
-!nvvm.annotations = !{!1, !2}
-!1 = !{ptr @foo, !"kernel", i32 1}
-!2 = !{ptr @foo, !"wroimage", i32 0}
+!nvvm.annotations = !{!1}
+!1 = !{ptr @foo, !"wroimage", i32 0}
diff --git a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
index ba556d2d9bd6..61837bde82ec 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
@@ -10,7 +10,7 @@ target triple = "nvptx-unknown-cuda"
 declare { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64, i32)
 declare i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1))
 
-define void @foo(i64 %img, ptr %red, i32 %idx) {
+define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
 ; CHECK-LABEL: foo(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
@@ -34,7 +34,7 @@ define void @foo(i64 %img, ptr %red, i32 %idx) {
 
 @tex0 = internal addrspace(1) global i64 0, align 8
 
-define void @bar(ptr %red, i32 %idx) {
+define ptx_kernel void @bar(ptr %red, i32 %idx) {
 ; CHECK-LABEL: bar(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
@@ -57,7 +57,7 @@ define void @bar(ptr %red, i32 %idx) {
 
 declare float @texfunc(i64)
 
-define void @baz(ptr %red, i32 %idx) {
+define ptx_kernel void @baz(ptr %red, i32 %idx) {
 ; CHECK-LABEL: baz(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
@@ -93,8 +93,5 @@ define void @baz(ptr %red, i32 %idx) {
   ret void
 }
 
-!nvvm.annotations = !{!1, !2, !3, !4}
-!1 = !{ptr @foo, !"kernel", i32 1}
-!2 = !{ptr @bar, !"kernel", i32 1}
-!3 = !{ptr addrspace(1) @tex0, !"texture", i32 1}
-!4 = !{ptr @baz, !"kernel", i32 1}
+!nvvm.annotations = !{!1}
+!1 = !{ptr addrspace(1) @tex0, !"texture", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/tex-read.ll b/llvm/test/CodeGen/NVPTX/tex-read.ll
index d11aea45a65f..d74c89f5abc8 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read.ll
@@ -6,7 +6,7 @@ target triple = "nvptx64-unknown-nvcl"
 declare { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64, i64, i32)
 
 ; CHECK: .entry foo
-define void @foo(i64 %img, i64 %sampler, ptr %red, i32 %idx) {
+define ptx_kernel void @foo(i64 %img, i64 %sampler, ptr %red, i32 %idx) {
 ; CHECK: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [foo_param_0, foo_param_1, {%r{{[0-9]+}}}]
   %val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64 %img, i64 %sampler, i32 %idx)
   %ret = extractvalue { float, float, float, float } %val, 0
@@ -15,7 +15,6 @@ define void @foo(i64 %img, i64 %sampler, ptr %red, i32 %idx) {
   ret void
 }
 
-!nvvm.annotations = !{!1, !2, !3}
-!1 = !{ptr @foo, !"kernel", i32 1}
+!nvvm.annotations = !{!2, !3}
 !2 = !{ptr @foo, !"rdoimage", i32 0}
 !3 = !{ptr @foo, !"sampler", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/unreachable.ll b/llvm/test/CodeGen/NVPTX/unreachable.ll
index 286f3588a754..80cf938d48b5 100644
--- a/llvm/test/CodeGen/NVPTX/unreachable.ll
+++ b/llvm/test/CodeGen/NVPTX/unreachable.ll
@@ -21,7 +21,7 @@ target triple = "nvptx-unknown-cuda"
 declare void @throw() #0
 declare void @llvm.trap() #0
 
-define void @kernel_func() {
+define ptx_kernel void @kernel_func() {
 ; NO-TRAP-UNREACHABLE-LABEL: kernel_func(
 ; NO-TRAP-UNREACHABLE:       {
 ; NO-TRAP-UNREACHABLE-EMPTY:
@@ -102,6 +102,3 @@ define void @kernel_func_2() {
 }
 
 attributes #0 = { noreturn }
-
-!nvvm.annotations = !{!1}
-!1 = !{ptr @kernel_func, !"kernel", i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index cb54812dea6d..f7ed690efabc 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -153,7 +153,7 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b16 %rs<6>;
 ; CHECK-PTX-NEXT:    .reg .b32 %r<7>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<11>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot2;
@@ -163,24 +163,20 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    add.s64 %rd2, %rd1, 7;
 ; CHECK-PTX-NEXT:    and.b64 %rd3, %rd2, -8;
 ; CHECK-PTX-NEXT:    ld.u32 %r2, [%rd3];
-; CHECK-PTX-NEXT:    or.b64 %rd4, %rd3, 4;
-; CHECK-PTX-NEXT:    ld.s8 %r3, [%rd4];
-; CHECK-PTX-NEXT:    or.b64 %rd5, %rd3, 5;
-; CHECK-PTX-NEXT:    or.b64 %rd6, %rd3, 7;
-; CHECK-PTX-NEXT:    ld.u8 %rs1, [%rd6];
+; CHECK-PTX-NEXT:    ld.s8 %r3, [%rd3+4];
+; CHECK-PTX-NEXT:    ld.u8 %rs1, [%rd3+7];
 ; CHECK-PTX-NEXT:    st.u8 [%SP+2], %rs1;
-; CHECK-PTX-NEXT:    ld.u8 %rs2, [%rd5];
-; CHECK-PTX-NEXT:    or.b64 %rd7, %rd3, 6;
-; CHECK-PTX-NEXT:    ld.u8 %rs3, [%rd7];
+; CHECK-PTX-NEXT:    ld.u8 %rs2, [%rd3+5];
+; CHECK-PTX-NEXT:    ld.u8 %rs3, [%rd3+6];
 ; CHECK-PTX-NEXT:    shl.b16 %rs4, %rs3, 8;
 ; CHECK-PTX-NEXT:    or.b16 %rs5, %rs4, %rs2;
 ; CHECK-PTX-NEXT:    st.u16 [%SP], %rs5;
-; CHECK-PTX-NEXT:    ld.u64 %rd8, [%rd3+8];
+; CHECK-PTX-NEXT:    ld.u64 %rd4, [%rd3+8];
 ; CHECK-PTX-NEXT:    add.s32 %r4, %r1, %r2;
 ; CHECK-PTX-NEXT:    add.s32 %r5, %r4, %r3;
-; CHECK-PTX-NEXT:    cvt.u64.u32 %rd9, %r5;
-; CHECK-PTX-NEXT:    add.s64 %rd10, %rd9, %rd8;
-; CHECK-PTX-NEXT:    cvt.u32.u64 %r6, %rd10;
+; CHECK-PTX-NEXT:    cvt.u64.u32 %rd5, %r5;
+; CHECK-PTX-NEXT:    add.s64 %rd6, %rd5, %rd4;
+; CHECK-PTX-NEXT:    cvt.u32.u64 %r6, %rd6;
 ; CHECK-PTX-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-PTX-NEXT:    ret;
 entry:
@@ -219,7 +215,7 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b16 %rs<10>;
 ; CHECK-PTX-NEXT:    .reg .b32 %r<4>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<8>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot3;
@@ -240,17 +236,16 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT:    st.u16 [%SP], %rs8;
 ; CHECK-PTX-NEXT:    mov.b32 %r1, 1;
 ; CHECK-PTX-NEXT:    st.u32 [%SP+8], %r1;
-; CHECK-PTX-NEXT:    add.u64 %rd5, %SP, 8;
-; CHECK-PTX-NEXT:    or.b64 %rd6, %rd5, 4;
 ; CHECK-PTX-NEXT:    mov.b16 %rs9, 1;
-; CHECK-PTX-NEXT:    st.u8 [%rd6], %rs9;
-; CHECK-PTX-NEXT:    mov.b64 %rd7, 1;
-; CHECK-PTX-NEXT:    st.u64 [%SP+16], %rd7;
+; CHECK-PTX-NEXT:    st.u8 [%SP+12], %rs9;
+; CHECK-PTX-NEXT:    mov.b64 %rd5, 1;
+; CHECK-PTX-NEXT:    st.u64 [%SP+16], %rd5;
+; CHECK-PTX-NEXT:    add.u64 %rd6, %SP, 8;
 ; CHECK-PTX-NEXT:    { // callseq 1, 0
 ; CHECK-PTX-NEXT:    .param .b32 param0;
 ; CHECK-PTX-NEXT:    st.param.b32 [param0], 1;
 ; CHECK-PTX-NEXT:    .param .b64 param1;
-; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd5;
+; CHECK-PTX-NEXT:    st.param.b64 [param1], %rd6;
 ; CHECK-PTX-NEXT:    .param .b32 retval0;
 ; CHECK-PTX-NEXT:    call.uni (retval0),
 ; CHECK-PTX-NEXT:    variadics2,
diff --git a/llvm/test/CodeGen/PowerPC/global-merge-aix-zero-size-struct.ll b/llvm/test/CodeGen/PowerPC/global-merge-aix-zero-size-struct.ll
new file mode 100644
index 000000000000..ec6fd7ee4cf4
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/global-merge-aix-zero-size-struct.ll
@@ -0,0 +1,20 @@
+; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu=pwr7 < %s | FileCheck %s
+
+; RUN: llc -verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff -mcpu=pwr7 --filetype=obj -o %t.o < %s
+; RUN: llvm-objdump --syms %t.o | FileCheck %s --check-prefix=OBJ
+
+%struct.anon = type {}
+
+@a = internal constant %struct.anon zeroinitializer, align 1
+@b = internal constant [6 x i8] c"hello\00", align 1
+
+; CHECK:      	.csect L.._MergedGlobals[RO],2
+; CHECK-NEXT: 	.lglobl	a                               # @_MergedGlobals
+; CHECK-NEXT: 	.lglobl	b
+; CHECK-NEXT: a:
+; CHECK-NEXT: b:
+; CHECK-NEXT: 	.string	"hello"
+
+; OBJ:      0000000000000000 l       .text	0000000000000006 L.._MergedGlobals
+; OBJ-NEXT: 0000000000000000 l       .text (csect: L.._MergedGlobals) 	0000000000000000 a
+; OBJ-NEXT: 0000000000000000 l       .text (csect: L.._MergedGlobals) 	0000000000000000 b
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll b/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll
index ff56ab193c48..0fd23a7d346d 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/add-imm.ll
@@ -14,7 +14,7 @@ define i32 @add_positive_low_bound_reject(i32 %a) nounwind {
 ;
 ; RV64I-LABEL: add_positive_low_bound_reject:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi a0, a0, 2047
+; RV64I-NEXT:    addiw a0, a0, 2047
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, 2047
   ret i32 %1
@@ -30,7 +30,7 @@ define i32 @add_positive_low_bound_accept(i32 %a) nounwind {
 ; RV64I-LABEL: add_positive_low_bound_accept:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a0, a0, 2047
-; RV64I-NEXT:    addi a0, a0, 1
+; RV64I-NEXT:    addiw a0, a0, 1
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, 2048
   ret i32 %1
@@ -46,7 +46,7 @@ define i32 @add_positive_high_bound_accept(i32 %a) nounwind {
 ; RV64I-LABEL: add_positive_high_bound_accept:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a0, a0, 2047
-; RV64I-NEXT:    addi a0, a0, 2047
+; RV64I-NEXT:    addiw a0, a0, 2047
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, 4094
   ret i32 %1
@@ -63,8 +63,8 @@ define i32 @add_positive_high_bound_reject(i32 %a) nounwind {
 ; RV64I-LABEL: add_positive_high_bound_reject:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a1, 1
-; RV64I-NEXT:    addiw a1, a1, -1
-; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    addi a1, a1, -1
+; RV64I-NEXT:    addw a0, a0, a1
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, 4095
   ret i32 %1
@@ -78,7 +78,7 @@ define i32 @add_negative_high_bound_reject(i32 %a) nounwind {
 ;
 ; RV64I-LABEL: add_negative_high_bound_reject:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi a0, a0, -2048
+; RV64I-NEXT:    addiw a0, a0, -2048
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, -2048
   ret i32 %1
@@ -94,7 +94,7 @@ define i32 @add_negative_high_bound_accept(i32 %a) nounwind {
 ; RV64I-LABEL: add_negative_high_bound_accept:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a0, a0, -2048
-; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    addiw a0, a0, -1
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, -2049
   ret i32 %1
@@ -110,7 +110,7 @@ define i32 @add_negative_low_bound_accept(i32 %a) nounwind {
 ; RV64I-LABEL: add_negative_low_bound_accept:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a0, a0, -2048
-; RV64I-NEXT:    addi a0, a0, -2048
+; RV64I-NEXT:    addiw a0, a0, -2048
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, -4096
   ret i32 %1
@@ -127,8 +127,8 @@ define i32 @add_negative_low_bound_reject(i32 %a) nounwind {
 ; RV64I-LABEL: add_negative_low_bound_reject:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a1, 1048575
-; RV64I-NEXT:    addiw a1, a1, -1
-; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    addi a1, a1, -1
+; RV64I-NEXT:    addw a0, a0, a1
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, -4097
   ret i32 %1
@@ -144,7 +144,7 @@ define i32 @add32_accept(i32 %a) nounwind {
 ; RV64I-LABEL: add32_accept:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a0, a0, 2047
-; RV64I-NEXT:    addi a0, a0, 952
+; RV64I-NEXT:    addiw a0, a0, 952
 ; RV64I-NEXT:    ret
   %1 = add i32 %a, 2999
   ret i32 %1
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
index ee414992a524..f1c0fccb78a3 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
@@ -37,7 +37,7 @@ define i32 @add_i8_signext_i32(i8 %a, i8 %b) {
 ; RV64IM-NEXT:    slli a1, a1, 56
 ; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    srai a1, a1, 56
-; RV64IM-NEXT:    add a0, a0, a1
+; RV64IM-NEXT:    addw a0, a0, a1
 ; RV64IM-NEXT:    ret
 entry:
   %0 = sext i8 %a to i32
@@ -58,7 +58,7 @@ define i32 @add_i8_zeroext_i32(i8 %a, i8 %b) {
 ; RV64IM:       # %bb.0: # %entry
 ; RV64IM-NEXT:    andi a0, a0, 255
 ; RV64IM-NEXT:    andi a1, a1, 255
-; RV64IM-NEXT:    add a0, a0, a1
+; RV64IM-NEXT:    addw a0, a0, a1
 ; RV64IM-NEXT:    ret
 entry:
   %0 = zext i8 %a to i32
@@ -78,7 +78,7 @@ define i32 @add_i32(i32 %a, i32 %b) {
 ;
 ; RV64IM-LABEL: add_i32:
 ; RV64IM:       # %bb.0: # %entry
-; RV64IM-NEXT:    add a0, a0, a1
+; RV64IM-NEXT:    addw a0, a0, a1
 ; RV64IM-NEXT:    ret
 entry:
   %0 = add i32 %a, %b
@@ -93,7 +93,7 @@ define i32 @addi_i32(i32 %a) {
 ;
 ; RV64IM-LABEL: addi_i32:
 ; RV64IM:       # %bb.0: # %entry
-; RV64IM-NEXT:    addi a0, a0, 1234
+; RV64IM-NEXT:    addiw a0, a0, 1234
 ; RV64IM-NEXT:    ret
 entry:
   %0 = add i32 %a, 1234
@@ -108,7 +108,7 @@ define i32 @sub_i32(i32 %a, i32 %b) {
 ;
 ; RV64IM-LABEL: sub_i32:
 ; RV64IM:       # %bb.0: # %entry
-; RV64IM-NEXT:    sub a0, a0, a1
+; RV64IM-NEXT:    subw a0, a0, a1
 ; RV64IM-NEXT:    ret
 entry:
   %0 = sub i32 %a, %b
@@ -123,7 +123,7 @@ define i32 @subi_i32(i32 %a) {
 ;
 ; RV64IM-LABEL: subi_i32:
 ; RV64IM:       # %bb.0: # %entry
-; RV64IM-NEXT:    addi a0, a0, -1234
+; RV64IM-NEXT:    addiw a0, a0, -1234
 ; RV64IM-NEXT:    ret
 entry:
   %0 = sub i32 %a, 1234
@@ -138,7 +138,7 @@ define i32 @neg_i32(i32 %a) {
 ;
 ; RV64IM-LABEL: neg_i32:
 ; RV64IM:       # %bb.0: # %entry
-; RV64IM-NEXT:    neg a0, a0
+; RV64IM-NEXT:    negw a0, a0
 ; RV64IM-NEXT:    ret
 entry:
   %0 = sub i32 0, %a
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll b/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll
index 6c848ecf0fff..3a55189076de 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/combine-neg-abs.ll
@@ -27,14 +27,13 @@ define i32 @expanded_neg_abs32(i32 %x) {
 ;
 ; RV64I-LABEL: expanded_neg_abs32:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    neg a1, a0
-; RV64I-NEXT:    sext.w a2, a1
-; RV64I-NEXT:    sext.w a3, a0
-; RV64I-NEXT:    blt a3, a2, .LBB0_2
+; RV64I-NEXT:    negw a1, a0
+; RV64I-NEXT:    sext.w a2, a0
+; RV64I-NEXT:    blt a2, a1, .LBB0_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a1, a0
 ; RV64I-NEXT:  .LBB0_2:
-; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    negw a0, a1
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: expanded_neg_abs32:
@@ -42,7 +41,7 @@ define i32 @expanded_neg_abs32(i32 %x) {
 ; RV64ZBB-NEXT:    negw a1, a0
 ; RV64ZBB-NEXT:    sext.w a0, a0
 ; RV64ZBB-NEXT:    max a0, a1, a0
-; RV64ZBB-NEXT:    neg a0, a0
+; RV64ZBB-NEXT:    negw a0, a0
 ; RV64ZBB-NEXT:    ret
   %n = sub i32 0, %x
   %t = call i32 @llvm.smax.i32(i32 %n, i32 %x)
@@ -69,14 +68,13 @@ define i32 @expanded_neg_abs32_unsigned(i32 %x) {
 ;
 ; RV64I-LABEL: expanded_neg_abs32_unsigned:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    neg a1, a0
-; RV64I-NEXT:    sext.w a2, a1
-; RV64I-NEXT:    sext.w a3, a0
-; RV64I-NEXT:    bltu a3, a2, .LBB1_2
+; RV64I-NEXT:    negw a1, a0
+; RV64I-NEXT:    sext.w a2, a0
+; RV64I-NEXT:    bltu a2, a1, .LBB1_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a1, a0
 ; RV64I-NEXT:  .LBB1_2:
-; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    negw a0, a1
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: expanded_neg_abs32_unsigned:
@@ -84,7 +82,7 @@ define i32 @expanded_neg_abs32_unsigned(i32 %x) {
 ; RV64ZBB-NEXT:    negw a1, a0
 ; RV64ZBB-NEXT:    sext.w a0, a0
 ; RV64ZBB-NEXT:    maxu a0, a1, a0
-; RV64ZBB-NEXT:    neg a0, a0
+; RV64ZBB-NEXT:    negw a0, a0
 ; RV64ZBB-NEXT:    ret
   %n = sub i32 0, %x
   %t = call i32 @llvm.umax.i32(i32 %n, i32 %x)
@@ -251,14 +249,13 @@ define i32 @expanded_neg_inv_abs32(i32 %x) {
 ;
 ; RV64I-LABEL: expanded_neg_inv_abs32:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    neg a1, a0
-; RV64I-NEXT:    sext.w a2, a1
-; RV64I-NEXT:    sext.w a3, a0
-; RV64I-NEXT:    blt a2, a3, .LBB4_2
+; RV64I-NEXT:    negw a1, a0
+; RV64I-NEXT:    sext.w a2, a0
+; RV64I-NEXT:    blt a1, a2, .LBB4_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a1, a0
 ; RV64I-NEXT:  .LBB4_2:
-; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    negw a0, a1
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: expanded_neg_inv_abs32:
@@ -266,7 +263,7 @@ define i32 @expanded_neg_inv_abs32(i32 %x) {
 ; RV64ZBB-NEXT:    negw a1, a0
 ; RV64ZBB-NEXT:    sext.w a0, a0
 ; RV64ZBB-NEXT:    min a0, a1, a0
-; RV64ZBB-NEXT:    neg a0, a0
+; RV64ZBB-NEXT:    negw a0, a0
 ; RV64ZBB-NEXT:    ret
   %n = sub i32 0, %x
   %t = call i32 @llvm.smin.i32(i32 %n, i32 %x)
@@ -293,14 +290,13 @@ define i32 @expanded_neg_inv_abs32_unsigned(i32 %x) {
 ;
 ; RV64I-LABEL: expanded_neg_inv_abs32_unsigned:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    neg a1, a0
-; RV64I-NEXT:    sext.w a2, a1
-; RV64I-NEXT:    sext.w a3, a0
-; RV64I-NEXT:    bltu a2, a3, .LBB5_2
+; RV64I-NEXT:    negw a1, a0
+; RV64I-NEXT:    sext.w a2, a0
+; RV64I-NEXT:    bltu a1, a2, .LBB5_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a1, a0
 ; RV64I-NEXT:  .LBB5_2:
-; RV64I-NEXT:    neg a0, a1
+; RV64I-NEXT:    negw a0, a1
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: expanded_neg_inv_abs32_unsigned:
@@ -308,7 +304,7 @@ define i32 @expanded_neg_inv_abs32_unsigned(i32 %x) {
 ; RV64ZBB-NEXT:    negw a1, a0
 ; RV64ZBB-NEXT:    sext.w a0, a0
 ; RV64ZBB-NEXT:    minu a0, a1, a0
-; RV64ZBB-NEXT:    neg a0, a0
+; RV64ZBB-NEXT:    negw a0, a0
 ; RV64ZBB-NEXT:    ret
   %n = sub i32 0, %x
   %t = call i32 @llvm.umin.i32(i32 %n, i32 %x)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll b/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll
index 9c7fd6895d37..360e84d37ec8 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/combine.ll
@@ -21,6 +21,7 @@ define i32 @constant_to_rhs(i32 %x) {
 ; RV64-O0-NEXT:    mv a1, a0
 ; RV64-O0-NEXT:    li a0, 1
 ; RV64-O0-NEXT:    add a0, a0, a1
+; RV64-O0-NEXT:    sext.w a0, a0
 ; RV64-O0-NEXT:    ret
 ;
 ; RV32-OPT-LABEL: constant_to_rhs:
@@ -30,7 +31,7 @@ define i32 @constant_to_rhs(i32 %x) {
 ;
 ; RV64-OPT-LABEL: constant_to_rhs:
 ; RV64-OPT:       # %bb.0:
-; RV64-OPT-NEXT:    addi a0, a0, 1
+; RV64-OPT-NEXT:    addiw a0, a0, 1
 ; RV64-OPT-NEXT:    ret
   %a = add i32 1, %x
   ret i32 %a
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/double-zfa.ll b/llvm/test/CodeGen/RISCV/GlobalISel/double-zfa.ll
index 385156b3b99d..487869922658 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/double-zfa.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/double-zfa.ll
@@ -1,9 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-
 ; RUN: llc -mtriple=riscv32 -mattr=+zfa,d -global-isel < %s \
-; RUN: | FileCheck %s
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV32IDZFA
 ; RUN: llc -mtriple=riscv64 -mattr=+zfa,d -global-isel < %s \
-; RUN: | FileCheck %s
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV64DZFA
 
 
 define double @fceil(double %a) {
@@ -86,3 +85,32 @@ define double @fminimum(double %a, double %b) {
   %c = call double @llvm.minimum.f64(double %a, double %b)
   ret double %c
 }
+
+define i64 @fmvh_x_d(double %fa) {
+; RV32IDZFA-LABEL: fmvh_x_d:
+; RV32IDZFA:       # %bb.0:
+; RV32IDZFA-NEXT:    fmv.x.w a0, fa0
+; RV32IDZFA-NEXT:    fmvh.x.d a1, fa0
+; RV32IDZFA-NEXT:    ret
+;
+; RV64DZFA-LABEL: fmvh_x_d:
+; RV64DZFA:       # %bb.0:
+; RV64DZFA-NEXT:    fmv.x.d a0, fa0
+; RV64DZFA-NEXT:    ret
+  %i = bitcast double %fa to i64
+  ret i64 %i
+}
+
+define double @fmvp_d_x(i64 %a) {
+; RV32IDZFA-LABEL: fmvp_d_x:
+; RV32IDZFA:       # %bb.0:
+; RV32IDZFA-NEXT:    fmvp.d.x fa0, a0, a1
+; RV32IDZFA-NEXT:    ret
+;
+; RV64DZFA-LABEL: fmvp_d_x:
+; RV64DZFA:       # %bb.0:
+; RV64DZFA-NEXT:    fmv.d.x fa0, a0
+; RV64DZFA-NEXT:    ret
+  %or = bitcast i64 %a to double
+  ret double %or
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll b/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll
index 72f0ab159f0a..234f33841206 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/freeze.ll
@@ -96,12 +96,19 @@ define ptr @freeze_ptr(ptr %x) {
 %struct.T = type { i32, i32 }
 
 define i32 @freeze_struct(ptr %p) {
-; CHECK-LABEL: freeze_struct:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lw a1, 0(a0)
-; CHECK-NEXT:    lw a0, 4(a0)
-; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    ret
+; RV32-LABEL: freeze_struct:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a0, 4(a0)
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: freeze_struct:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lw a1, 0(a0)
+; RV64-NEXT:    lw a0, 4(a0)
+; RV64-NEXT:    addw a0, a1, a0
+; RV64-NEXT:    ret
   %s = load %struct.T, ptr %p
   %y1 = freeze %struct.T %s
   %v1 = extractvalue %struct.T %y1, 0
@@ -111,12 +118,19 @@ define i32 @freeze_struct(ptr %p) {
 }
 
 define i32 @freeze_anonstruct(ptr %p) {
-; CHECK-LABEL: freeze_anonstruct:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lw a1, 0(a0)
-; CHECK-NEXT:    lw a0, 4(a0)
-; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    ret
+; RV32-LABEL: freeze_anonstruct:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a0, 4(a0)
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: freeze_anonstruct:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lw a1, 0(a0)
+; RV64-NEXT:    lw a0, 4(a0)
+; RV64-NEXT:    addw a0, a1, a0
+; RV64-NEXT:    ret
   %s = load {i32, i32}, ptr %p
   %y1 = freeze {i32, i32} %s
   %v1 = extractvalue {i32, i32} %y1, 0
@@ -141,7 +155,7 @@ define i32 @freeze_anonstruct2(ptr %p) {
 ; RV64-NEXT:    lw a0, 0(a0)
 ; RV64-NEXT:    slli a1, a1, 48
 ; RV64-NEXT:    srli a1, a1, 48
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addw a0, a0, a1
 ; RV64-NEXT:    ret
   %s = load {i32, i16}, ptr %p
   %y1 = freeze {i32, i16} %s
@@ -168,7 +182,7 @@ define i32 @freeze_anonstruct2_sext(ptr %p) {
 ; RV64-NEXT:    lw a0, 0(a0)
 ; RV64-NEXT:    slli a1, a1, 48
 ; RV64-NEXT:    srai a1, a1, 48
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addw a0, a0, a1
 ; RV64-NEXT:    ret
   %s = load {i32, i16}, ptr %p
   %y1 = freeze {i32, i16} %s
@@ -180,12 +194,19 @@ define i32 @freeze_anonstruct2_sext(ptr %p) {
 }
 
 define i32 @freeze_array(ptr %p) nounwind {
-; CHECK-LABEL: freeze_array:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lw a1, 0(a0)
-; CHECK-NEXT:    lw a0, 4(a0)
-; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    ret
+; RV32-LABEL: freeze_array:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a0, 4(a0)
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: freeze_array:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lw a1, 0(a0)
+; RV64-NEXT:    lw a0, 4(a0)
+; RV64-NEXT:    addw a0, a1, a0
+; RV64-NEXT:    ret
   %s = load [2 x i32], ptr %p
   %y1 = freeze [2 x i32] %s
   %v1 = extractvalue [2 x i32] %y1, 0
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll b/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll
index 1156edffe919..31a78d4f72ce 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll
@@ -98,7 +98,7 @@ define i32 @abs32(i32 %x) {
 ; RV64I-LABEL: abs32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sraiw a1, a0, 31
-; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    addw a0, a0, a1
 ; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
index a27e2b80cd98..dbc13840a026 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
@@ -23,7 +23,7 @@
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
 # DEBUG-NEXT: G_SUB (opcode [[SUB_OPC:[0-9]+]]): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. opcode [[SUB_OPC]] is aliased to [[ADD_OPC]]
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
@@ -59,7 +59,6 @@
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
 # DEBUG-NEXT: G_AND (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir
index 22ce8a0fd0df..78a2227b84a3 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir
@@ -86,9 +86,10 @@ body:             |
     ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 31
     ; RV64I-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[ASSERT_SEXT]], [[C]](s64)
     ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[ASSERT_SEXT]], [[ASHR]]
-    ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[ADD]], [[ASHR]]
-    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[XOR]], 32
-    ; RV64I-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[SEXT_INREG]], [[ASHR]]
+    ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[XOR]], 32
+    ; RV64I-NEXT: $x10 = COPY [[SEXT_INREG1]](s64)
     ; RV64I-NEXT: PseudoRET implicit $x10
     ;
     ; RV64ZBB-LABEL: name: abs_i32
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv64.mir
index 48b65a1dd6ba..8f2b9f36eb9f 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv64.mir
@@ -69,7 +69,8 @@ body:             |
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: $x10 = COPY [[ADD]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir
index f2ec70933261..eed1aac8f6c1 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir
@@ -339,7 +339,7 @@ body:             |
     ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
     ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[SEXT_INREG]](s64), [[SEXT_INREG1]]
-    ; CHECK-NEXT: $x10 = COPY [[ADD]](s64)
+    ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; CHECK-NEXT: $x11 = COPY [[ICMP]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
     %2:_(s64) = COPY $x10
@@ -454,10 +454,11 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
     ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32
-    ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[SEXT_INREG]](s64), [[SEXT_INREG1]]
-    ; CHECK-NEXT: $x10 = COPY [[SUB]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32
+    ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[SEXT_INREG1]](s64), [[SEXT_INREG2]]
+    ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; CHECK-NEXT: $x11 = COPY [[ICMP]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
     %2:_(s64) = COPY $x10
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir
index 57fc513dc9e3..e28572d05207 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-const-rv64.mir
@@ -145,7 +145,8 @@ body:             |
     ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -64769
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[C]]
-    ; CHECK-NEXT: $x10 = COPY [[ADD]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10
     %0:_(s32) = G_CONSTANT i32 -64769
     %1:_(s64) = COPY $x10
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir
index 6cc5477b85a4..62d731351ffd 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir
@@ -59,7 +59,8 @@ body:             |
     ; RV64ZBB-NEXT: [[CLZW:%[0-9]+]]:_(s64) = G_CLZW [[AND]]
     ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[CLZW]], [[C1]]
-    ; RV64ZBB-NEXT: $x10 = COPY [[SUB]](s64)
+    ; RV64ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; RV64ZBB-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; RV64ZBB-NEXT: PseudoRET implicit $x10
     %1:_(s64) = COPY $x10
     %0:_(s8) = G_TRUNC %1(s64)
@@ -129,7 +130,8 @@ body:             |
     ; RV64ZBB-NEXT: [[CLZW:%[0-9]+]]:_(s64) = G_CLZW [[AND]]
     ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[CLZW]], [[C1]]
-    ; RV64ZBB-NEXT: $x10 = COPY [[SUB]](s64)
+    ; RV64ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; RV64ZBB-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; RV64ZBB-NEXT: PseudoRET implicit $x10
     %1:_(s64) = COPY $x10
     %0:_(s16) = G_TRUNC %1(s64)
@@ -175,16 +177,19 @@ body:             |
     ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1431655765
     ; RV64I-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[LSHR5]], [[C6]]
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[OR4]], [[AND6]]
-    ; RV64I-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C1]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; RV64I-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C1]]
     ; RV64I-NEXT: [[LSHR6:%[0-9]+]]:_(s64) = G_LSHR [[AND7]], [[C2]](s64)
     ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 858993459
     ; RV64I-NEXT: [[AND8:%[0-9]+]]:_(s64) = G_AND [[LSHR6]], [[C7]]
-    ; RV64I-NEXT: [[AND9:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C7]]
+    ; RV64I-NEXT: [[AND9:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C7]]
     ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[AND8]], [[AND9]]
-    ; RV64I-NEXT: [[LSHR7:%[0-9]+]]:_(s64) = G_LSHR [[ADD]], [[C3]](s64)
-    ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR7]], [[ADD]]
+    ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; RV64I-NEXT: [[LSHR7:%[0-9]+]]:_(s64) = G_LSHR [[SEXT_INREG1]], [[C3]](s64)
+    ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR7]], [[SEXT_INREG1]]
+    ; RV64I-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD1]], 32
     ; RV64I-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 252645135
-    ; RV64I-NEXT: [[AND10:%[0-9]+]]:_(s64) = G_AND [[ADD1]], [[C8]]
+    ; RV64I-NEXT: [[AND10:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG2]], [[C8]]
     ; RV64I-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 16843009
     ; RV64I-NEXT: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND10]], [[C9]]
@@ -192,7 +197,8 @@ body:             |
     ; RV64I-NEXT: [[LSHR8:%[0-9]+]]:_(s64) = G_LSHR [[AND11]], [[C10]](s64)
     ; RV64I-NEXT: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
     ; RV64I-NEXT: [[SUB1:%[0-9]+]]:_(s64) = G_SUB [[C11]], [[LSHR8]]
-    ; RV64I-NEXT: $x10 = COPY [[SUB1]](s64)
+    ; RV64I-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB1]], 32
+    ; RV64I-NEXT: $x10 = COPY [[SEXT_INREG3]](s64)
     ; RV64I-NEXT: PseudoRET implicit $x10
     ;
     ; RV64ZBB-LABEL: name: ctlz_i32
@@ -328,7 +334,8 @@ body:             |
     ; RV64ZBB-NEXT: [[CLZW:%[0-9]+]]:_(s64) = G_CLZW [[AND]]
     ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[CLZW]], [[C1]]
-    ; RV64ZBB-NEXT: $x10 = COPY [[SUB]](s64)
+    ; RV64ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; RV64ZBB-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; RV64ZBB-NEXT: PseudoRET implicit $x10
     %1:_(s64) = COPY $x10
     %0:_(s8) = G_TRUNC %1(s64)
@@ -398,7 +405,8 @@ body:             |
     ; RV64ZBB-NEXT: [[CLZW:%[0-9]+]]:_(s64) = G_CLZW [[AND]]
     ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[CLZW]], [[C1]]
-    ; RV64ZBB-NEXT: $x10 = COPY [[SUB]](s64)
+    ; RV64ZBB-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; RV64ZBB-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; RV64ZBB-NEXT: PseudoRET implicit $x10
     %1:_(s64) = COPY $x10
     %0:_(s16) = G_TRUNC %1(s64)
@@ -444,16 +452,19 @@ body:             |
     ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1431655765
     ; RV64I-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[LSHR5]], [[C6]]
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[OR4]], [[AND6]]
-    ; RV64I-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C1]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; RV64I-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C1]]
     ; RV64I-NEXT: [[LSHR6:%[0-9]+]]:_(s64) = G_LSHR [[AND7]], [[C2]](s64)
     ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 858993459
     ; RV64I-NEXT: [[AND8:%[0-9]+]]:_(s64) = G_AND [[LSHR6]], [[C7]]
-    ; RV64I-NEXT: [[AND9:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C7]]
+    ; RV64I-NEXT: [[AND9:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C7]]
     ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[AND8]], [[AND9]]
-    ; RV64I-NEXT: [[LSHR7:%[0-9]+]]:_(s64) = G_LSHR [[ADD]], [[C3]](s64)
-    ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR7]], [[ADD]]
+    ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; RV64I-NEXT: [[LSHR7:%[0-9]+]]:_(s64) = G_LSHR [[SEXT_INREG1]], [[C3]](s64)
+    ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR7]], [[SEXT_INREG1]]
+    ; RV64I-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD1]], 32
     ; RV64I-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 252645135
-    ; RV64I-NEXT: [[AND10:%[0-9]+]]:_(s64) = G_AND [[ADD1]], [[C8]]
+    ; RV64I-NEXT: [[AND10:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG2]], [[C8]]
     ; RV64I-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 16843009
     ; RV64I-NEXT: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND10]], [[C9]]
@@ -461,7 +472,8 @@ body:             |
     ; RV64I-NEXT: [[LSHR8:%[0-9]+]]:_(s64) = G_LSHR [[AND11]], [[C10]](s64)
     ; RV64I-NEXT: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
     ; RV64I-NEXT: [[SUB1:%[0-9]+]]:_(s64) = G_SUB [[C11]], [[LSHR8]]
-    ; RV64I-NEXT: $x10 = COPY [[SUB1]](s64)
+    ; RV64I-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB1]], 32
+    ; RV64I-NEXT: $x10 = COPY [[SEXT_INREG3]](s64)
     ; RV64I-NEXT: PseudoRET implicit $x10
     ;
     ; RV64ZBB-LABEL: name: ctlz_zero_undef_i32
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv64.mir
index 1493514394bd..c61c46df0a43 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv64.mir
@@ -129,18 +129,21 @@ body:             |
     ; RV64I-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1431655765
     ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[LSHR]], [[C2]]
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[AND1]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
     ; RV64I-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; RV64I-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C1]]
+    ; RV64I-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C1]]
     ; RV64I-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[AND2]], [[C3]](s64)
     ; RV64I-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 858993459
     ; RV64I-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[LSHR1]], [[C4]]
-    ; RV64I-NEXT: [[AND4:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C4]]
+    ; RV64I-NEXT: [[AND4:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C4]]
     ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[AND3]], [[AND4]]
+    ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
     ; RV64I-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; RV64I-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[ADD]], [[C5]](s64)
-    ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[ADD]]
+    ; RV64I-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[SEXT_INREG1]], [[C5]](s64)
+    ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[SEXT_INREG1]]
+    ; RV64I-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD1]], 32
     ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 252645135
-    ; RV64I-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[ADD1]], [[C6]]
+    ; RV64I-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG2]], [[C6]]
     ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16843009
     ; RV64I-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND5]], [[C7]]
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir
index 252e79280af6..87155bb8b743 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir
@@ -131,7 +131,8 @@ body:             |
     ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
     ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY]], [[C]]
     ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[C]]
-    ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[ADD]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[SEXT_INREG]]
     ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; RV64I-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
     ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[AND]], [[C2]]
@@ -139,18 +140,21 @@ body:             |
     ; RV64I-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1431655765
     ; RV64I-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[LSHR]], [[C3]]
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[AND]], [[AND2]]
+    ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
     ; RV64I-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; RV64I-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C2]]
+    ; RV64I-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG1]], [[C2]]
     ; RV64I-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[AND3]], [[C4]](s64)
     ; RV64I-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 858993459
     ; RV64I-NEXT: [[AND4:%[0-9]+]]:_(s64) = G_AND [[LSHR1]], [[C5]]
-    ; RV64I-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C5]]
+    ; RV64I-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG1]], [[C5]]
     ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[AND4]], [[AND5]]
+    ; RV64I-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD1]], 32
     ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; RV64I-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[ADD1]], [[C6]](s64)
-    ; RV64I-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[ADD1]]
+    ; RV64I-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[SEXT_INREG2]], [[C6]](s64)
+    ; RV64I-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[SEXT_INREG2]]
+    ; RV64I-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD2]], 32
     ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 252645135
-    ; RV64I-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[ADD2]], [[C7]]
+    ; RV64I-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG3]], [[C7]]
     ; RV64I-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 16843009
     ; RV64I-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND6]], [[C8]]
@@ -351,7 +355,8 @@ body:             |
     ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
     ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY]], [[C]]
     ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[C]]
-    ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[ADD]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[SEXT_INREG]]
     ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; RV64I-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
     ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[AND]], [[C2]]
@@ -359,18 +364,21 @@ body:             |
     ; RV64I-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1431655765
     ; RV64I-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[LSHR]], [[C3]]
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[AND]], [[AND2]]
+    ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
     ; RV64I-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; RV64I-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C2]]
+    ; RV64I-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG1]], [[C2]]
     ; RV64I-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[AND3]], [[C4]](s64)
     ; RV64I-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 858993459
     ; RV64I-NEXT: [[AND4:%[0-9]+]]:_(s64) = G_AND [[LSHR1]], [[C5]]
-    ; RV64I-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C5]]
+    ; RV64I-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG1]], [[C5]]
     ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[AND4]], [[AND5]]
+    ; RV64I-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD1]], 32
     ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; RV64I-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[ADD1]], [[C6]](s64)
-    ; RV64I-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[ADD1]]
+    ; RV64I-NEXT: [[LSHR2:%[0-9]+]]:_(s64) = G_LSHR [[SEXT_INREG2]], [[C6]](s64)
+    ; RV64I-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[LSHR2]], [[SEXT_INREG2]]
+    ; RV64I-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD2]], 32
     ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 252645135
-    ; RV64I-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[ADD2]], [[C7]]
+    ; RV64I-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG3]], [[C7]]
     ; RV64I-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 16843009
     ; RV64I-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
     ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND6]], [[C8]]
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ext-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ext-rv64.mir
index f3bc1ce28cfa..aff7d4d3ec1e 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ext-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ext-rv64.mir
@@ -30,8 +30,9 @@ body:             |
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C]]
     ; CHECK-NEXT: $x10 = COPY [[AND]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv64.mir
index 4689a7dd219a..776f5f53fafb 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rotate-rv64.mir
@@ -88,9 +88,10 @@ body:             |
     ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
     ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 31
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C]], [[COPY1]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
     ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C1]]
     ; RV64I-NEXT: [[SLLW:%[0-9]+]]:_(s64) = G_SLLW [[COPY]], [[AND]]
-    ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C1]]
+    ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C1]]
     ; RV64I-NEXT: [[SRLW:%[0-9]+]]:_(s64) = G_SRLW [[COPY]], [[AND1]]
     ; RV64I-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[SLLW]], [[SRLW]]
     ; RV64I-NEXT: $x10 = COPY [[OR]](s64)
@@ -233,9 +234,10 @@ body:             |
     ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
     ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 31
     ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C]], [[COPY1]]
+    ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
     ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C1]]
     ; RV64I-NEXT: [[SRLW:%[0-9]+]]:_(s64) = G_SRLW [[COPY]], [[AND]]
-    ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C1]]
+    ; RV64I-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SEXT_INREG]], [[C1]]
     ; RV64I-NEXT: [[SLLW:%[0-9]+]]:_(s64) = G_SLLW [[COPY]], [[AND1]]
     ; RV64I-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[SRLW]], [[SLLW]]
     ; RV64I-NEXT: $x10 = COPY [[OR]](s64)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir
index bf8c8d690f07..d162bfcca1bc 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir
@@ -16,8 +16,8 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ADD]](s64)
     ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[SEXT_INREG]](s64)
     ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[SEXT_INREG]](s64), [[SEXT_INREG1]]
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
@@ -97,7 +97,8 @@ body:             |
     ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C]](s64)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -2147483648
     ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[ASHR]], [[C1]]
-    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ADD1]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD1]], 32
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[SEXT_INREG3]](s64)
     ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[TRUNC1]], [[COPY2]]
     ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SELECT]](s32)
     ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
@@ -173,10 +174,11 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
     ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[SUB]](s64)
-    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32
-    ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
-    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[SEXT_INREG]](s64), [[SEXT_INREG1]]
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[SEXT_INREG]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32
+    ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
+    ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[SEXT_INREG1]](s64), [[SEXT_INREG2]]
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
     ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
     ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[TRUNC1]], [[TRUNC]]
@@ -250,7 +252,8 @@ body:             |
     ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C]](s64)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -2147483648
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[ASHR]], [[C1]]
-    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ADD]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[SEXT_INREG3]](s64)
     ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[TRUNC1]], [[COPY2]]
     ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SELECT]](s32)
     ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sub-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sub-rv64.mir
index da3ab9e1a527..7ab07ee0d70d 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sub-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sub-rv64.mir
@@ -69,7 +69,8 @@ body:             |
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
     ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: $x10 = COPY [[SUB]](s64)
+    ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+    ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG]](s64)
     ; CHECK-NEXT: PseudoRET implicit $x10
     %0:_(s64) = COPY $x10
     %1:_(s64) = COPY $x11
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
index 0b876fed59c1..9df319e73a11 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
@@ -18,7 +18,7 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 2
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    srliw a2, a0, 4
@@ -30,15 +30,15 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -75,7 +75,7 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 2
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    srliw a2, a0, 4
@@ -87,15 +87,15 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -133,15 +133,14 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    li s0, 32
-; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    sext.w a2, a0
+; RV64I-NEXT:    addiw a0, a0, -1
 ; RV64I-NEXT:    li a1, 32
-; RV64I-NEXT:    beqz a2, .LBB2_2
+; RV64I-NEXT:    beqz a0, .LBB2_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 2
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    srliw a2, a0, 4
@@ -153,15 +152,15 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -200,7 +199,7 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a0, a0, 1
 ; RV64I-NEXT:    lui a1, 349525
 ; RV64I-NEXT:    or a0, s0, a0
-; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addi a1, a1, 1365
 ; RV64I-NEXT:    srliw a2, a0, 2
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    srliw a2, a0, 4
@@ -212,15 +211,15 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -271,7 +270,7 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    srliw a0, a0, 2
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srli a2, a0, 2
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    srli a2, a0, 4
@@ -283,15 +282,15 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -299,7 +298,7 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
 ; RV64I-NEXT:    li a1, 32
-; RV64I-NEXT:    sub a0, a1, a0
+; RV64I-NEXT:    subw a0, a1, a0
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    .cfi_restore ra
 ; RV64I-NEXT:    addi sp, sp, 16
@@ -408,19 +407,19 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    and a0, a1, a0
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -451,19 +450,19 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    and a0, a1, a0
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -493,19 +492,19 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi a1, s0, -1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -549,19 +548,19 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi a1, s0, -1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addi a1, a2, 1365
 ; RV64I-NEXT:    srliw a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -669,18 +668,18 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    addi a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -706,18 +705,18 @@ define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind {
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    addi a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -746,19 +745,19 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lwu a0, 0(a0)
 ; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addi a1, a1, 1365
 ; RV64I-NEXT:    srli a2, a0, 1
 ; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    lui a1, 4112
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a0, a0, a2
@@ -1057,7 +1056,7 @@ define i32 @abs_i32(i32 %x) {
 ; RV64I-LABEL: abs_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sraiw a1, a0, 31
-; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    addw a0, a0, a1
 ; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/scmp.ll b/llvm/test/CodeGen/RISCV/GlobalISel/scmp.ll
index 4346e04ecda6..daeb2e69c83b 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/scmp.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/scmp.ll
@@ -97,7 +97,7 @@ define i32 @scmp.32.32(i32 %x, i32 %y) nounwind {
 ; RV64I-NEXT:    sext.w a1, a1
 ; RV64I-NEXT:    slt a2, a1, a0
 ; RV64I-NEXT:    slt a0, a0, a1
-; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    subw a0, a2, a0
 ; RV64I-NEXT:    ret
   %1 = call i32 @llvm.scmp(i32 %x, i32 %y)
   ret i32 %1
@@ -122,7 +122,7 @@ define i32 @scmp.32.64(i64 %x, i64 %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slt a2, a1, a0
 ; RV64I-NEXT:    slt a0, a0, a1
-; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    subw a0, a2, a0
 ; RV64I-NEXT:    ret
   %1 = call i32 @llvm.scmp(i64 %x, i64 %y)
   ret i32 %1
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/ucmp.ll b/llvm/test/CodeGen/RISCV/GlobalISel/ucmp.ll
index 9784c58dca4f..463883b371ca 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/ucmp.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/ucmp.ll
@@ -97,7 +97,7 @@ define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind {
 ; RV64I-NEXT:    sext.w a1, a1
 ; RV64I-NEXT:    sltu a2, a1, a0
 ; RV64I-NEXT:    sltu a0, a0, a1
-; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    subw a0, a2, a0
 ; RV64I-NEXT:    ret
   %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
   ret i32 %1
@@ -115,7 +115,7 @@ define i32 @ucmp.32.32_sext(i32 signext %x, i32 signext %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sltu a2, a1, a0
 ; RV64I-NEXT:    sltu a0, a0, a1
-; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    subw a0, a2, a0
 ; RV64I-NEXT:    ret
   %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
   ret i32 %1
@@ -135,7 +135,7 @@ define i32 @ucmp.32.32_zext(i32 zeroext %x, i32 zeroext %y) nounwind {
 ; RV64I-NEXT:    sext.w a1, a1
 ; RV64I-NEXT:    sltu a2, a1, a0
 ; RV64I-NEXT:    sltu a0, a0, a1
-; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    subw a0, a2, a0
 ; RV64I-NEXT:    ret
   %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
   ret i32 %1
@@ -160,7 +160,7 @@ define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sltu a2, a1, a0
 ; RV64I-NEXT:    sltu a0, a0, a1
-; RV64I-NEXT:    sub a0, a2, a0
+; RV64I-NEXT:    subw a0, a2, a0
 ; RV64I-NEXT:    ret
   %1 = call i32 @llvm.ucmp(i64 %x, i64 %y)
   ret i32 %1
diff --git a/llvm/test/CodeGen/RISCV/add_sext_shl_constant.ll b/llvm/test/CodeGen/RISCV/add_sext_shl_constant.ll
index fe89b4aa2417..d7f62ae83434 100644
--- a/llvm/test/CodeGen/RISCV/add_sext_shl_constant.ll
+++ b/llvm/test/CodeGen/RISCV/add_sext_shl_constant.ll
@@ -320,6 +320,19 @@ define i64 @add_shl_moreOneUse_sh3add(i64 %x) {
   ret i64 %add
 }
 
+;; Covers a case which previously crashed (pr119527)
+define i64 @add_shl_sext(i32 %1) {
+; RV64-LABEL: add_shl_sext:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi a1, a0, 3
+; RV64-NEXT:    sllw a0, a1, a0
+; RV64-NEXT:    ret
+  %3 = add i32 %1, 3
+  %4 = shl i32 %3, %1
+  %5 = sext i32 %4 to i64
+  ret i64 %5
+}
+
 define i64 @add_shl_moreOneUse_sh4add(i64 %x) {
 ; RV64-LABEL: add_shl_moreOneUse_sh4add:
 ; RV64:       # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index 7e55e0590ec5..c0fcc6f61111 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -84,6 +84,7 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcia %s -o - | FileCheck --check-prefix=RV32XQCIA %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqciac %s -o - | FileCheck --check-prefix=RV32XQCIAC %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicli %s -o - | FileCheck --check-prefix=RV32XQCICLI %s
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm %s -o - | FileCheck --check-prefix=RV32XQCICM %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcics %s -o - | FileCheck --check-prefix=RV32XQCICS %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicsr %s -o - | FileCheck --check-prefix=RV32XQCICSR %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcilsm %s -o - | FileCheck --check-prefix=RV32XQCILSM %s
@@ -397,6 +398,7 @@
 ; RV32XQCIA: .attribute 5, "rv32i2p1_xqcia0p2"
 ; RV32XQCIAC: .attribute 5, "rv32i2p1_zca1p0_xqciac0p2"
 ; RV32XQCICLI: .attribute 5, "rv32i2p1_xqcicli0p2"
+; RV32XQCICM: .attribute 5, "rv32i2p1_zca1p0_xqcicm0p2"
 ; RV32XQCICS: .attribute 5, "rv32i2p1_xqcics0p2"
 ; RV32XQCICSR: .attribute 5, "rv32i2p1_xqcicsr0p2"
 ; RV32XQCILSM: .attribute 5, "rv32i2p1_xqcilsm0p2"
diff --git a/llvm/test/CodeGen/RISCV/kcfi-isel-mir.ll b/llvm/test/CodeGen/RISCV/kcfi-isel-mir.ll
index 4c47b5f741fa..2c428cf4ac87 100644
--- a/llvm/test/CodeGen/RISCV/kcfi-isel-mir.ll
+++ b/llvm/test/CodeGen/RISCV/kcfi-isel-mir.ll
@@ -20,7 +20,7 @@ define void @f2(ptr noundef %x) #0 {
   ; CHECK-NEXT:   liveins: $x10
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprtc = COPY $x10
-  ; CHECK-NEXT:   PseudoTAILIndirect [[COPY]], implicit $x2, cfi-type 12345678
+  ; CHECK-NEXT:   PseudoTAILIndirect [[COPY]], csr_ilp32_lp64, implicit $x2, cfi-type 12345678
   tail call void %x() [ "kcfi"(i32 12345678) ]
   ret void
 }
diff --git a/llvm/test/CodeGen/RISCV/kcfi-mir.ll b/llvm/test/CodeGen/RISCV/kcfi-mir.ll
index f9f383a35358..0c0d39a8bf87 100644
--- a/llvm/test/CodeGen/RISCV/kcfi-mir.ll
+++ b/llvm/test/CodeGen/RISCV/kcfi-mir.ll
@@ -30,7 +30,7 @@ define void @f2(ptr noundef %x) #0 {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   BUNDLE implicit-def $x6, implicit-def $x6_w, implicit-def $x6_h, implicit-def $x7, implicit-def $x7_w, implicit-def $x7_h, implicit-def $x28, implicit-def $x28_w, implicit-def $x28_h, implicit-def $x29, implicit-def $x29_w, implicit-def $x29_h, implicit-def $x30, implicit-def $x30_w, implicit-def $x30_h, implicit-def $x31, implicit-def $x31_w, implicit-def $x31_h, implicit killed $x10, implicit $x2 {
   ; CHECK-NEXT:     KCFI_CHECK $x10, 12345678, implicit-def $x6, implicit-def $x7, implicit-def $x28, implicit-def $x29, implicit-def $x30, implicit-def $x31
-  ; CHECK-NEXT:     PseudoTAILIndirect killed $x10, implicit $x2
+  ; CHECK-NEXT:     PseudoTAILIndirect killed $x10, csr_ilp32_lp64, implicit $x2
   ; CHECK-NEXT:   }
   tail call void %x() [ "kcfi"(i32 12345678) ]
   ret void
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 10d24927d9b7..4d34621cd5f2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -1445,10 +1445,9 @@ define <vscale x 1 x i64> @vp_bitreverse_nxv1i64(<vscale x 1 x i64> %va, <vscale
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a6), zero
 ; RV32-NEXT:    lui a4, 61681
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v10, v8, a3, v0.t
 ; RV32-NEXT:    addi a5, a5, -256
 ; RV32-NEXT:    vand.vx v11, v8, a5, v0.t
@@ -1595,9 +1594,7 @@ define <vscale x 1 x i64> @vp_bitreverse_nxv1i64_unmasked(<vscale x 1 x i64> %va
 ; RV32-NEXT:    vand.vx v13, v8, a1
 ; RV32-NEXT:    vand.vx v12, v12, a1
 ; RV32-NEXT:    vor.vv v11, v12, v11
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v13, v13, a4
 ; RV32-NEXT:    vor.vv v10, v10, v13
 ; RV32-NEXT:    vsrl.vi v13, v8, 8
@@ -1730,10 +1727,9 @@ define <vscale x 2 x i64> @vp_bitreverse_nxv2i64(<vscale x 2 x i64> %va, <vscale
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli a4, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a6), zero
 ; RV32-NEXT:    lui a4, 61681
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsll.vx v12, v8, a3, v0.t
 ; RV32-NEXT:    addi a5, a5, -256
 ; RV32-NEXT:    vand.vx v14, v8, a5, v0.t
@@ -1880,9 +1876,7 @@ define <vscale x 2 x i64> @vp_bitreverse_nxv2i64_unmasked(<vscale x 2 x i64> %va
 ; RV32-NEXT:    vand.vx v18, v8, a1
 ; RV32-NEXT:    vand.vx v16, v16, a1
 ; RV32-NEXT:    vor.vv v10, v16, v10
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsll.vx v18, v18, a4
 ; RV32-NEXT:    vor.vv v12, v12, v18
 ; RV32-NEXT:    vsrl.vi v18, v8, 8
@@ -2015,10 +2009,9 @@ define <vscale x 4 x i64> @vp_bitreverse_nxv4i64(<vscale x 4 x i64> %va, <vscale
 ; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    vsetvli a4, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a6), zero
 ; RV32-NEXT:    lui a4, 61681
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a3, v0.t
 ; RV32-NEXT:    addi a5, a5, -256
 ; RV32-NEXT:    vand.vx v20, v8, a5, v0.t
@@ -2165,9 +2158,7 @@ define <vscale x 4 x i64> @vp_bitreverse_nxv4i64_unmasked(<vscale x 4 x i64> %va
 ; RV32-NEXT:    vand.vx v28, v8, a1
 ; RV32-NEXT:    vand.vx v24, v24, a1
 ; RV32-NEXT:    vor.vv v12, v24, v12
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsll.vx v28, v28, a4
 ; RV32-NEXT:    vor.vv v16, v16, v28
 ; RV32-NEXT:    vsrl.vi v28, v8, 8
@@ -2315,7 +2306,6 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
@@ -2323,7 +2313,6 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    addi a5, sp, 16
@@ -2528,9 +2517,7 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64_unmasked(<vscale x 7 x i64> %va
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 24
 ; RV32-NEXT:    vand.vx v16, v16, a5
 ; RV32-NEXT:    vsrl.vi v0, v8, 8
@@ -2704,7 +2691,6 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
@@ -2712,7 +2698,6 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    addi a5, sp, 16
@@ -2917,9 +2902,7 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64_unmasked(<vscale x 8 x i64> %va
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 24
 ; RV32-NEXT:    vand.vx v16, v16, a5
 ; RV32-NEXT:    vsrl.vi v0, v8, 8
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
index 0dc1d0c32ac4..0c58cca0f947 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
@@ -523,11 +523,9 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v9, v8, a2, v0.t
-; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vand.vx v10, v8, a0, v0.t
 ; RV32-NEXT:    vlse64.v v11, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v10, v10, a4, v0.t
 ; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vand.vx v10, v8, a5, v0.t
@@ -538,7 +536,7 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
 ; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsrl.vx v10, v8, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v12, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a0, v0.t
 ; RV32-NEXT:    vor.vv v10, v12, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
 ; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
@@ -609,15 +607,13 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsll.vx v10, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    addi a0, a3, -256
 ; RV32-NEXT:    vsrl.vx v11, v8, a2
 ; RV32-NEXT:    vsrl.vx v12, v8, a4
-; RV32-NEXT:    vand.vx v13, v8, a1
-; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vand.vx v13, v8, a0
+; RV32-NEXT:    vand.vx v12, v12, a0
 ; RV32-NEXT:    vor.vv v11, v12, v11
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v13, v13, a4
 ; RV32-NEXT:    vor.vv v10, v10, v13
 ; RV32-NEXT:    vsrl.vi v13, v8, 8
@@ -695,11 +691,9 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsll.vx v10, v8, a2, v0.t
-; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v12, v8, a1, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e64, m2, ta, ma
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vand.vx v12, v8, a0, v0.t
 ; RV32-NEXT:    vlse64.v v14, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsll.vx v12, v12, a4, v0.t
 ; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
 ; RV32-NEXT:    vand.vx v12, v8, a5, v0.t
@@ -710,7 +704,7 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
 ; RV32-NEXT:    vsrl.vx v12, v8, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV32-NEXT:    vor.vv v12, v16, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV32-NEXT:    vand.vx v16, v16, a5, v0.t
@@ -781,15 +775,13 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsll.vx v12, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    addi a0, a3, -256
 ; RV32-NEXT:    vsrl.vx v14, v8, a2
 ; RV32-NEXT:    vsrl.vx v16, v8, a4
-; RV32-NEXT:    vand.vx v18, v8, a1
-; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vand.vx v18, v8, a0
+; RV32-NEXT:    vand.vx v16, v16, a0
 ; RV32-NEXT:    vor.vv v14, v16, v14
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsll.vx v18, v18, a4
 ; RV32-NEXT:    vor.vv v12, v12, v18
 ; RV32-NEXT:    vsrl.vi v18, v8, 8
@@ -867,11 +859,9 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v20, v8, a1, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e64, m4, ta, ma
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vand.vx v20, v8, a0, v0.t
 ; RV32-NEXT:    vlse64.v v12, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsll.vx v20, v20, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
 ; RV32-NEXT:    vand.vx v20, v8, a5, v0.t
@@ -882,7 +872,7 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
 ; RV32-NEXT:    vsrl.vx v20, v8, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a0, v0.t
 ; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
 ; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
@@ -953,15 +943,13 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsll.vx v16, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    addi a0, a3, -256
 ; RV32-NEXT:    vsrl.vx v20, v8, a2
 ; RV32-NEXT:    vsrl.vx v24, v8, a4
-; RV32-NEXT:    vand.vx v28, v8, a1
-; RV32-NEXT:    vand.vx v24, v24, a1
+; RV32-NEXT:    vand.vx v28, v8, a0
+; RV32-NEXT:    vand.vx v24, v24, a0
 ; RV32-NEXT:    vor.vv v20, v24, v20
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsll.vx v28, v28, a4
 ; RV32-NEXT:    vor.vv v16, v16, v28
 ; RV32-NEXT:    vsrl.vi v28, v8, 8
@@ -1043,51 +1031,49 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vand.vx v24, v8, a0, v0.t
 ; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a0, v0.t
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
@@ -1193,24 +1179,22 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v24, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    addi a0, a3, -256
 ; RV32-NEXT:    vsrl.vx v16, v8, a2
 ; RV32-NEXT:    vsrl.vx v0, v8, a4
-; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vand.vx v0, v0, a0
 ; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a0
 ; RV32-NEXT:    vsll.vx v0, v0, a4
 ; RV32-NEXT:    vor.vv v16, v24, v0
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vlse64.v v0, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 24
 ; RV32-NEXT:    vand.vx v16, v16, a5
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
@@ -1221,7 +1205,6 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v24, v24, 8
 ; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    csrr a0, vlenb
@@ -1318,51 +1301,49 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vand.vx v24, v8, a0, v0.t
 ; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a0, v0.t
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
@@ -1468,24 +1449,22 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v24, v8, a2
-; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    addi a0, a3, -256
 ; RV32-NEXT:    vsrl.vx v16, v8, a2
 ; RV32-NEXT:    vsrl.vx v0, v8, a4
-; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vand.vx v0, v0, a0
 ; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a0
 ; RV32-NEXT:    vsll.vx v0, v0, a4
 ; RV32-NEXT:    vor.vv v16, v24, v0
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vlse64.v v0, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 24
 ; RV32-NEXT:    vand.vx v16, v16, a5
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
@@ -1496,7 +1475,6 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v24, v24, 8
 ; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    csrr a0, vlenb
@@ -1716,11 +1694,9 @@ define <vscale x 1 x i48> @vp_bswap_nxv1i48(<vscale x 1 x i48> %va, <vscale x 1
 ; RV32-NEXT:    sw zero, 12(sp)
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v9, v8, a2, v0.t
-; RV32-NEXT:    addi a1, a3, -256
-; RV32-NEXT:    vand.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vand.vx v10, v8, a0, v0.t
 ; RV32-NEXT:    vlse64.v v11, (a6), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsll.vx v10, v10, a4, v0.t
 ; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vand.vx v10, v8, a5, v0.t
@@ -1731,7 +1707,7 @@ define <vscale x 1 x i48> @vp_bswap_nxv1i48(<vscale x 1 x i48> %va, <vscale x 1
 ; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsrl.vx v10, v8, a2, v0.t
 ; RV32-NEXT:    vsrl.vx v12, v8, a4, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a0, v0.t
 ; RV32-NEXT:    vor.vv v10, v12, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
 ; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
index 15793eaada07..66952cac8e00 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
@@ -1254,12 +1254,10 @@ define void @copysign_neg_trunc_v4f16_v4f32(ptr %x, ptr %y) {
 define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) {
 ; ZVFH-LABEL: copysign_neg_trunc_v3f16_v3f32:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
+; ZVFH-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vle32.v v8, (a1)
 ; ZVFH-NEXT:    vle16.v v9, (a0)
-; ZVFH-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfncvt.f.f.w v10, v8
-; ZVFH-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
 ; ZVFH-NEXT:    vfsgnjn.vv v8, v9, v10
 ; ZVFH-NEXT:    vse16.v v8, (a0)
 ; ZVFH-NEXT:    ret
@@ -1272,9 +1270,7 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) {
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    addi a2, a1, -1
 ; ZVFHMIN-NEXT:    vand.vx v8, v8, a2
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v9
-; ZVFHMIN-NEXT:    vsetivli zero, 3, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v9, v10, a1
 ; ZVFHMIN-NEXT:    vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
@@ -4013,9 +4009,10 @@ define void @trunc_v6f16(ptr %x) {
 ; ZVFH-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
+; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
-; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, mu
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
 ; ZVFH-NEXT:    vse16.v v8, (a0)
 ; ZVFH-NEXT:    ret
@@ -4197,10 +4194,11 @@ define void @ceil_v6f16(ptr %x) {
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    fsrmi a1, 3
+; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a1
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
-; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, mu
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
 ; ZVFH-NEXT:    vse16.v v8, (a0)
 ; ZVFH-NEXT:    ret
@@ -4388,10 +4386,11 @@ define void @floor_v6f16(ptr %x) {
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    fsrmi a1, 2
+; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a1
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
-; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, mu
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
 ; ZVFH-NEXT:    vse16.v v8, (a0)
 ; ZVFH-NEXT:    ret
@@ -4579,10 +4578,11 @@ define void @round_v6f16(ptr %x) {
 ; ZVFH-NEXT:    vfabs.v v9, v8
 ; ZVFH-NEXT:    vmflt.vf v0, v9, fa5
 ; ZVFH-NEXT:    fsrmi a1, 4
+; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFH-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; ZVFH-NEXT:    fsrm a1
 ; ZVFH-NEXT:    vfcvt.f.x.v v9, v9, v0.t
-; ZVFH-NEXT:    vsetivli zero, 6, e16, m1, ta, mu
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
 ; ZVFH-NEXT:    vfsgnj.vv v8, v9, v8, v0.t
 ; ZVFH-NEXT:    vse16.v v8, (a0)
 ; ZVFH-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index 59c7feb53ce9..80e462c93769 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -1142,9 +1142,7 @@ define void @mulhu_v6i16(ptr %x) {
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    lui a1, %hi(.LCPI67_0)
 ; CHECK-NEXT:    addi a1, a1, %lo(.LCPI67_0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v9, (a1)
-; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vdivu.vv v8, v8, v9
 ; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
index 4f0f5dd78c94..bf8baafc4a25 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
@@ -530,7 +530,7 @@ define i32 @reduce_and_16xi32_prefix5(ptr %p) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsetivli zero, 5, e32, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, -1
 ; CHECK-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
 ; CHECK-NEXT:    vredand.vs v8, v8, v10
@@ -725,7 +725,7 @@ define i32 @reduce_umin_16xi32_prefix5(ptr %p) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vsetivli zero, 5, e32, m1, ta, ma
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.i v10, -1
 ; RV32-NEXT:    vsetivli zero, 5, e32, m2, ta, ma
 ; RV32-NEXT:    vredminu.vs v8, v8, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
index 5d407caf7151..05254e60b65b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
@@ -473,6 +473,7 @@ define <32 x i64> @select_evl_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c)
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
index 7649d60def11..33fe73a097e3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
@@ -582,14 +582,14 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v8, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8, v0.t
@@ -616,13 +616,13 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
@@ -652,14 +652,14 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v8, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8, v0.t
@@ -686,13 +686,13 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
@@ -722,15 +722,15 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v12, v14, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v14, v14, v0.t
@@ -758,13 +758,13 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v10, v12, v10, v0
@@ -796,15 +796,15 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v16, v20, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v20, v20, v0.t
@@ -832,13 +832,13 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v12, v16, v12, v0
@@ -876,15 +876,15 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -927,10 +927,10 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
@@ -995,64 +995,62 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a3
+; ZVFHMIN-NEXT:    li a2, 25
+; ZVFHMIN-NEXT:    mul a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    li a5, 24
+; ZVFHMIN-NEXT:    mul a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v0, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a3, a3, a4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    li a3, 25
+; ZVFHMIN-NEXT:    mul a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a3
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl1r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
@@ -1070,35 +1068,43 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v24, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v24
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v24, v0.t
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
@@ -1110,7 +1116,7 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    li a1, 25
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
@@ -1152,68 +1158,61 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    li a2, 25
+; ZVFHMIN-NEXT:    mul a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x19, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 25 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv8r.v v0, v8
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v12, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v8, v24, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a3, a3, a4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 4
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a3
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl1r.v v13, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -1221,7 +1220,8 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -1229,43 +1229,49 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
index 8e448fcda9c5..c65712e9965a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
@@ -582,14 +582,14 @@ define <vscale x 1 x half> @vfmin_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v8, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8, v0.t
@@ -616,13 +616,13 @@ define <vscale x 1 x half> @vfmin_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
@@ -652,14 +652,14 @@ define <vscale x 2 x half> @vfmin_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v8, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8, v0.t
@@ -686,13 +686,13 @@ define <vscale x 2 x half> @vfmin_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v8, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v10, v0
@@ -722,15 +722,15 @@ define <vscale x 4 x half> @vfmin_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v12, v14, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v14, v14, v0.t
@@ -758,13 +758,13 @@ define <vscale x 4 x half> @vfmin_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v10, v12, v10, v0
@@ -796,15 +796,15 @@ define <vscale x 8 x half> @vfmin_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v16, v20, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v20, v20, v0.t
@@ -832,13 +832,13 @@ define <vscale x 8 x half> @vfmin_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v12, v16, v12, v0
@@ -876,15 +876,15 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -927,10 +927,10 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
@@ -995,64 +995,62 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a3
+; ZVFHMIN-NEXT:    li a2, 25
+; ZVFHMIN-NEXT:    mul a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    li a5, 24
+; ZVFHMIN-NEXT:    mul a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v0, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a3, a3, a4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    li a3, 25
+; ZVFHMIN-NEXT:    mul a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a3
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl1r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
@@ -1070,35 +1068,43 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v24, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v24
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v24, v0.t
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
@@ -1110,7 +1116,7 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    li a1, 25
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
@@ -1152,68 +1158,61 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    li a2, 25
+; ZVFHMIN-NEXT:    mul a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x19, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 25 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv8r.v v0, v8
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v12, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v8, v24, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a3, a3, a4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 4
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a3
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl1r.v v13, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    vmfeq.vv v12, v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -1221,7 +1220,8 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -1229,43 +1229,49 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll b/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll
index 2fda344690bf..6787c8c24c87 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll
@@ -18,7 +18,7 @@ entry:
 define i64 @reduce_add2(<4 x i64> %v) {
 ; CHECK-LABEL: reduce_add2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 8
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vredsum.vs v8, v8, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index 70b53841bff4..06f48762e24c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -1658,10 +1658,10 @@ define <vscale x 1 x i1> @fcmp_oeq_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_oeq_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"oeq", <vscale x 1 x i1> %m, i32 %evl)
@@ -1678,11 +1678,11 @@ define <vscale x 1 x i1> @fcmp_oeq_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_oeq_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1701,11 +1701,11 @@ define <vscale x 1 x i1> @fcmp_oeq_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_oeq_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1723,10 +1723,10 @@ define <vscale x 1 x i1> @fcmp_ogt_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ogt_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"ogt", <vscale x 1 x i1> %m, i32 %evl)
@@ -1743,11 +1743,11 @@ define <vscale x 1 x i1> @fcmp_ogt_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ogt_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1766,11 +1766,11 @@ define <vscale x 1 x i1> @fcmp_ogt_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ogt_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v0, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1788,10 +1788,10 @@ define <vscale x 1 x i1> @fcmp_oge_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_oge_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"oge", <vscale x 1 x i1> %m, i32 %evl)
@@ -1808,11 +1808,11 @@ define <vscale x 1 x i1> @fcmp_oge_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_oge_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1831,11 +1831,11 @@ define <vscale x 1 x i1> @fcmp_oge_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_oge_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v0, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1853,10 +1853,10 @@ define <vscale x 1 x i1> @fcmp_olt_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_olt_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v0, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"olt", <vscale x 1 x i1> %m, i32 %evl)
@@ -1873,11 +1873,11 @@ define <vscale x 1 x i1> @fcmp_olt_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_olt_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v0, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1896,11 +1896,11 @@ define <vscale x 1 x i1> @fcmp_olt_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_olt_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1918,10 +1918,10 @@ define <vscale x 1 x i1> @fcmp_ole_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ole_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v0, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"ole", <vscale x 1 x i1> %m, i32 %evl)
@@ -1938,11 +1938,11 @@ define <vscale x 1 x i1> @fcmp_ole_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ole_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v0, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1961,11 +1961,11 @@ define <vscale x 1 x i1> @fcmp_ole_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ole_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -1985,10 +1985,10 @@ define <vscale x 1 x i1> @fcmp_one_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_one_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v9, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -2009,11 +2009,11 @@ define <vscale x 1 x i1> @fcmp_one_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_one_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v9, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -2036,11 +2036,11 @@ define <vscale x 1 x i1> @fcmp_one_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_one_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v9, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -2062,10 +2062,10 @@ define <vscale x 1 x i1> @fcmp_ord_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ord_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vmfeq.vv v9, v9, v9, v0.t
 ; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
@@ -2088,14 +2088,14 @@ define <vscale x 1 x i1> @fcmp_ord_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ord_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v9, v9, v9, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2119,14 +2119,14 @@ define <vscale x 1 x i1> @fcmp_ord_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ord_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v9, v9, v9, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -2147,10 +2147,10 @@ define <vscale x 1 x i1> @fcmp_ueq_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ueq_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v9, v0.t
 ; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
@@ -2171,11 +2171,11 @@ define <vscale x 1 x i1> @fcmp_ueq_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ueq_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v9, v0.t
 ; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
@@ -2198,11 +2198,11 @@ define <vscale x 1 x i1> @fcmp_ueq_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ueq_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v9, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
@@ -2223,10 +2223,10 @@ define <vscale x 1 x i1> @fcmp_ugt_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ugt_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2245,11 +2245,11 @@ define <vscale x 1 x i1> @fcmp_ugt_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ugt_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2270,11 +2270,11 @@ define <vscale x 1 x i1> @fcmp_ugt_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ugt_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2294,10 +2294,10 @@ define <vscale x 1 x i1> @fcmp_uge_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_uge_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2316,11 +2316,11 @@ define <vscale x 1 x i1> @fcmp_uge_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_uge_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2341,11 +2341,11 @@ define <vscale x 1 x i1> @fcmp_uge_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_uge_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2365,10 +2365,10 @@ define <vscale x 1 x i1> @fcmp_ult_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ult_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2387,11 +2387,11 @@ define <vscale x 1 x i1> @fcmp_ult_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ult_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2412,11 +2412,11 @@ define <vscale x 1 x i1> @fcmp_ult_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ult_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2436,10 +2436,10 @@ define <vscale x 1 x i1> @fcmp_ule_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ule_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2458,11 +2458,11 @@ define <vscale x 1 x i1> @fcmp_ule_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ule_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2483,11 +2483,11 @@ define <vscale x 1 x i1> @fcmp_ule_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ule_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2506,10 +2506,10 @@ define <vscale x 1 x i1> @fcmp_une_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_une_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v0, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"une", <vscale x 1 x i1> %m, i32 %evl)
@@ -2526,11 +2526,11 @@ define <vscale x 1 x i1> @fcmp_une_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_une_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v0, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -2549,11 +2549,11 @@ define <vscale x 1 x i1> @fcmp_une_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_une_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v0, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
@@ -2573,10 +2573,10 @@ define <vscale x 1 x i1> @fcmp_uno_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_uno_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vmfne.vv v9, v9, v9, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -2599,14 +2599,14 @@ define <vscale x 1 x i1> @fcmp_uno_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_uno_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v9, v9, v9, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2630,14 +2630,14 @@ define <vscale x 1 x i1> @fcmp_uno_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_uno_vf_swap_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v9, v9, v9, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -2658,10 +2658,10 @@ define <vscale x 3 x i1> @fcmp_oeq_vv_nxv3f16(<vscale x 3 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_oeq_vv_nxv3f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2681,10 +2681,10 @@ define <vscale x 8 x i1> @fcmp_oeq_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_oeq_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2703,11 +2703,11 @@ define <vscale x 8 x i1> @fcmp_oeq_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_oeq_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2728,11 +2728,11 @@ define <vscale x 8 x i1> @fcmp_oeq_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_oeq_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2752,10 +2752,10 @@ define <vscale x 8 x i1> @fcmp_ogt_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ogt_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2774,11 +2774,11 @@ define <vscale x 8 x i1> @fcmp_ogt_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ogt_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2799,11 +2799,11 @@ define <vscale x 8 x i1> @fcmp_ogt_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ogt_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2823,10 +2823,10 @@ define <vscale x 8 x i1> @fcmp_oge_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_oge_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2845,11 +2845,11 @@ define <vscale x 8 x i1> @fcmp_oge_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_oge_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2870,11 +2870,11 @@ define <vscale x 8 x i1> @fcmp_oge_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_oge_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2894,10 +2894,10 @@ define <vscale x 8 x i1> @fcmp_olt_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_olt_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2916,11 +2916,11 @@ define <vscale x 8 x i1> @fcmp_olt_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_olt_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2941,11 +2941,11 @@ define <vscale x 8 x i1> @fcmp_olt_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_olt_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2965,10 +2965,10 @@ define <vscale x 8 x i1> @fcmp_ole_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ole_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -2987,11 +2987,11 @@ define <vscale x 8 x i1> @fcmp_ole_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ole_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3012,11 +3012,11 @@ define <vscale x 8 x i1> @fcmp_ole_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ole_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3037,10 +3037,10 @@ define <vscale x 8 x i1> @fcmp_one_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_one_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -3061,11 +3061,11 @@ define <vscale x 8 x i1> @fcmp_one_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_one_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -3088,11 +3088,11 @@ define <vscale x 8 x i1> @fcmp_one_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_one_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
@@ -3114,13 +3114,13 @@ define <vscale x 8 x i1> @fcmp_ord_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ord_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v10, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmand.mm v0, v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -3142,14 +3142,14 @@ define <vscale x 8 x i1> @fcmp_ord_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ord_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v10, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmand.mm v0, v10, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3173,14 +3173,14 @@ define <vscale x 8 x i1> @fcmp_ord_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ord_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v10, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmand.mm v0, v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -3201,10 +3201,10 @@ define <vscale x 8 x i1> @fcmp_ueq_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ueq_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
@@ -3225,11 +3225,11 @@ define <vscale x 8 x i1> @fcmp_ueq_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ueq_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
@@ -3252,11 +3252,11 @@ define <vscale x 8 x i1> @fcmp_ueq_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ueq_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
@@ -3277,10 +3277,10 @@ define <vscale x 8 x i1> @fcmp_ugt_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ugt_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3299,11 +3299,11 @@ define <vscale x 8 x i1> @fcmp_ugt_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ugt_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3324,11 +3324,11 @@ define <vscale x 8 x i1> @fcmp_ugt_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ugt_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3348,10 +3348,10 @@ define <vscale x 8 x i1> @fcmp_uge_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_uge_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3370,11 +3370,11 @@ define <vscale x 8 x i1> @fcmp_uge_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_uge_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3395,11 +3395,11 @@ define <vscale x 8 x i1> @fcmp_uge_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_uge_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3419,10 +3419,10 @@ define <vscale x 8 x i1> @fcmp_ult_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ult_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3441,11 +3441,11 @@ define <vscale x 8 x i1> @fcmp_ult_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ult_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3466,11 +3466,11 @@ define <vscale x 8 x i1> @fcmp_ult_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ult_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfle.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3490,10 +3490,10 @@ define <vscale x 8 x i1> @fcmp_ule_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_ule_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3512,11 +3512,11 @@ define <vscale x 8 x i1> @fcmp_ule_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_ule_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3537,11 +3537,11 @@ define <vscale x 8 x i1> @fcmp_ule_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_ule_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmnot.m v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3561,10 +3561,10 @@ define <vscale x 8 x i1> @fcmp_une_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_une_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3583,11 +3583,11 @@ define <vscale x 8 x i1> @fcmp_une_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_une_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3608,11 +3608,11 @@ define <vscale x 8 x i1> @fcmp_une_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_une_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3633,13 +3633,13 @@ define <vscale x 8 x i1> @fcmp_uno_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: fcmp_uno_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v10, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -3661,14 +3661,14 @@ define <vscale x 8 x i1> @fcmp_uno_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-LABEL: fcmp_uno_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v10, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v10, v8
 ; ZVFHMIN-NEXT:    ret
@@ -3692,14 +3692,14 @@ define <vscale x 8 x i1> @fcmp_uno_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN-LABEL: fcmp_uno_vf_swap_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v10, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vmor.mm v0, v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -3829,14 +3829,14 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a7, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv8r.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli t0, a0, 4
-; ZVFHMIN-NEXT:    add a0, t0, a0
+; ZVFHMIN-NEXT:    slli a7, a0, 4
+; ZVFHMIN-NEXT:    add a0, a7, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
@@ -3844,7 +3844,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v24
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a7, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v6, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    bltu a6, a4, .LBB171_2
 ; ZVFHMIN-NEXT:  # %bb.1:
@@ -3857,16 +3857,16 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a6, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a7, a0, 4
-; ZVFHMIN-NEXT:    add a0, a7, a0
+; ZVFHMIN-NEXT:    slli a6, a0, 4
+; ZVFHMIN-NEXT:    add a0, a6, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    vsetvli zero, a6, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v5, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    add a0, a3, a3
 ; ZVFHMIN-NEXT:    bltu a2, a5, .LBB171_4
@@ -3881,6 +3881,9 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vl1r.v v7, (a6) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a6, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    sltu a6, a2, a5
+; ZVFHMIN-NEXT:    addi a6, a6, -1
+; ZVFHMIN-NEXT:    and a5, a6, a5
 ; ZVFHMIN-NEXT:    csrr a6, vlenb
 ; ZVFHMIN-NEXT:    mv a7, a6
 ; ZVFHMIN-NEXT:    slli a6, a6, 3
@@ -3890,31 +3893,28 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    add a6, sp, a6
 ; ZVFHMIN-NEXT:    addi a6, a6, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a6, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a5, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a7, a6, 4
-; ZVFHMIN-NEXT:    add a6, a7, a6
-; ZVFHMIN-NEXT:    add a6, sp, a6
-; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a7, a6, 5
-; ZVFHMIN-NEXT:    add a6, a7, a6
-; ZVFHMIN-NEXT:    add a6, sp, a6
-; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a5, vlenb
+; ZVFHMIN-NEXT:    slli a6, a5, 4
+; ZVFHMIN-NEXT:    add a5, a6, a5
+; ZVFHMIN-NEXT:    add a5, sp, a5
+; ZVFHMIN-NEXT:    addi a5, a5, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a5, vlenb
+; ZVFHMIN-NEXT:    slli a6, a5, 5
+; ZVFHMIN-NEXT:    add a5, a6, a5
+; ZVFHMIN-NEXT:    add a5, sp, a5
+; ZVFHMIN-NEXT:    addi a5, a5, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
-; ZVFHMIN-NEXT:    sltu a6, a2, a5
-; ZVFHMIN-NEXT:    addi a6, a6, -1
-; ZVFHMIN-NEXT:    and a5, a6, a5
-; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a7, a6, 4
-; ZVFHMIN-NEXT:    add a6, a7, a6
-; ZVFHMIN-NEXT:    add a6, sp, a6
-; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a5, vlenb
+; ZVFHMIN-NEXT:    slli a6, a5, 4
+; ZVFHMIN-NEXT:    add a5, a6, a5
+; ZVFHMIN-NEXT:    add a5, sp, a5
+; ZVFHMIN-NEXT:    addi a5, a5, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v4, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslideup.vx v5, v6, a3
@@ -3923,16 +3923,16 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    mv a2, a4
 ; ZVFHMIN-NEXT:  .LBB171_6:
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a5, a4, 5
-; ZVFHMIN-NEXT:    add a4, a5, a4
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a4, a2, 5
+; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslideup.vx v8, v4, a3
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
index 61cc754e21df..9c733b17dc6e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
@@ -2367,9 +2367,8 @@ define <vscale x 1 x i1> @icmp_eq_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmseq.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2394,9 +2393,8 @@ define <vscale x 1 x i1> @icmp_eq_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %b
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmseq.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2451,9 +2449,8 @@ define <vscale x 1 x i1> @icmp_ne_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsne.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2478,9 +2475,8 @@ define <vscale x 1 x i1> @icmp_ne_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %b
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsne.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2535,9 +2531,8 @@ define <vscale x 1 x i1> @icmp_ugt_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2562,9 +2557,8 @@ define <vscale x 1 x i1> @icmp_ugt_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2619,9 +2613,8 @@ define <vscale x 1 x i1> @icmp_uge_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsleu.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2647,9 +2640,8 @@ define <vscale x 1 x i1> @icmp_uge_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsleu.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2704,9 +2696,8 @@ define <vscale x 1 x i1> @icmp_ult_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2731,9 +2722,8 @@ define <vscale x 1 x i1> @icmp_ult_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2788,9 +2778,8 @@ define <vscale x 1 x i1> @icmp_sgt_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmslt.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2815,9 +2804,8 @@ define <vscale x 1 x i1> @icmp_sgt_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmslt.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2872,9 +2860,8 @@ define <vscale x 1 x i1> @icmp_sge_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsle.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2900,9 +2887,8 @@ define <vscale x 1 x i1> @icmp_sge_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsle.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2957,9 +2943,8 @@ define <vscale x 1 x i1> @icmp_slt_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmslt.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2984,9 +2969,8 @@ define <vscale x 1 x i1> @icmp_slt_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmslt.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -3041,9 +3025,8 @@ define <vscale x 1 x i1> @icmp_sle_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsle.vv v0, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -3068,9 +3051,8 @@ define <vscale x 1 x i1> @icmp_sle_vx_swap_nxv1i64(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmsle.vv v0, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -3129,9 +3111,8 @@ define <vscale x 8 x i1> @icmp_eq_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmseq.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3158,9 +3139,8 @@ define <vscale x 8 x i1> @icmp_eq_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %b
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmseq.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3220,9 +3200,8 @@ define <vscale x 8 x i1> @icmp_ne_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsne.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3249,9 +3228,8 @@ define <vscale x 8 x i1> @icmp_ne_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %b
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsne.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3311,9 +3289,8 @@ define <vscale x 8 x i1> @icmp_ugt_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3340,9 +3317,8 @@ define <vscale x 8 x i1> @icmp_ugt_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3402,9 +3378,8 @@ define <vscale x 8 x i1> @icmp_uge_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsleu.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3432,9 +3407,8 @@ define <vscale x 8 x i1> @icmp_uge_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsleu.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3494,9 +3468,8 @@ define <vscale x 8 x i1> @icmp_ult_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3523,9 +3496,8 @@ define <vscale x 8 x i1> @icmp_ult_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsltu.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3585,9 +3557,8 @@ define <vscale x 8 x i1> @icmp_sgt_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmslt.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3614,9 +3585,8 @@ define <vscale x 8 x i1> @icmp_sgt_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmslt.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3676,9 +3646,8 @@ define <vscale x 8 x i1> @icmp_sge_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsle.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3706,9 +3675,8 @@ define <vscale x 8 x i1> @icmp_sge_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsle.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3768,9 +3736,8 @@ define <vscale x 8 x i1> @icmp_slt_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmslt.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3797,9 +3764,8 @@ define <vscale x 8 x i1> @icmp_slt_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmslt.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3859,9 +3825,8 @@ define <vscale x 8 x i1> @icmp_sle_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsle.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -3888,9 +3853,8 @@ define <vscale x 8 x i1> @icmp_sle_vx_swap_nxv8i64(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmsle.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v16
 ; RV32-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
index fee6799e992f..77f3cf3ca498 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
@@ -1487,9 +1487,8 @@ define <vscale x 1 x i64> @vadd_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1514,9 +1513,8 @@ define <vscale x 1 x i64> @vadd_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1583,9 +1581,8 @@ define <vscale x 2 x i64> @vadd_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1610,9 +1607,8 @@ define <vscale x 2 x i64> @vadd_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1679,9 +1675,8 @@ define <vscale x 4 x i64> @vadd_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1706,9 +1701,8 @@ define <vscale x 4 x i64> @vadd_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1775,9 +1769,8 @@ define <vscale x 8 x i64> @vadd_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1802,9 +1795,8 @@ define <vscale x 8 x i64> @vadd_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vadd.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vand-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vand-vp.ll
index b0c5a72f6f9e..4866bb06f19e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vand-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vand-vp.ll
@@ -1314,9 +1314,8 @@ define <vscale x 1 x i64> @vand_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1341,9 +1340,8 @@ define <vscale x 1 x i64> @vand_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1410,9 +1408,8 @@ define <vscale x 2 x i64> @vand_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1437,9 +1434,8 @@ define <vscale x 2 x i64> @vand_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1506,9 +1502,8 @@ define <vscale x 4 x i64> @vand_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1533,9 +1528,8 @@ define <vscale x 4 x i64> @vand_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1602,9 +1596,8 @@ define <vscale x 8 x i64> @vand_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1629,9 +1622,8 @@ define <vscale x 8 x i64> @vand_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll
index 32992301bd39..763b2908b102 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll
@@ -1115,9 +1115,8 @@ define <vscale x 1 x i64> @vandn_vx_vp_nxv1i64(i64 %a, <vscale x 1 x i64> %b, <v
 ; CHECK-RV32-NEXT:    sw a0, 8(sp)
 ; CHECK-RV32-NEXT:    sw a1, 12(sp)
 ; CHECK-RV32-NEXT:    addi a0, sp, 8
-; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vlse64.v v9, (a0), zero
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; CHECK-RV32-NEXT:    vlse64.v v9, (a0), zero
 ; CHECK-RV32-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1139,9 +1138,8 @@ define <vscale x 1 x i64> @vandn_vx_vp_nxv1i64(i64 %a, <vscale x 1 x i64> %b, <v
 ; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
 ; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
 ; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-ZVKB32-NEXT:    vlse64.v v9, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v9, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
 ; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 0
@@ -1208,9 +1206,8 @@ define <vscale x 2 x i64> @vandn_vx_vp_nxv2i64(i64 %a, <vscale x 2 x i64> %b, <v
 ; CHECK-RV32-NEXT:    sw a0, 8(sp)
 ; CHECK-RV32-NEXT:    sw a1, 12(sp)
 ; CHECK-RV32-NEXT:    addi a0, sp, 8
-; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vlse64.v v10, (a0), zero
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; CHECK-RV32-NEXT:    vlse64.v v10, (a0), zero
 ; CHECK-RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1232,9 +1229,8 @@ define <vscale x 2 x i64> @vandn_vx_vp_nxv2i64(i64 %a, <vscale x 2 x i64> %b, <v
 ; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
 ; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
 ; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; CHECK-ZVKB32-NEXT:    vlse64.v v10, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v10, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
 ; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 0
@@ -1301,9 +1297,8 @@ define <vscale x 4 x i64> @vandn_vx_vp_nxv4i64(i64 %a, <vscale x 4 x i64> %b, <v
 ; CHECK-RV32-NEXT:    sw a0, 8(sp)
 ; CHECK-RV32-NEXT:    sw a1, 12(sp)
 ; CHECK-RV32-NEXT:    addi a0, sp, 8
-; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vlse64.v v12, (a0), zero
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; CHECK-RV32-NEXT:    vlse64.v v12, (a0), zero
 ; CHECK-RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1325,9 +1320,8 @@ define <vscale x 4 x i64> @vandn_vx_vp_nxv4i64(i64 %a, <vscale x 4 x i64> %b, <v
 ; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
 ; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
 ; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; CHECK-ZVKB32-NEXT:    vlse64.v v12, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v12, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
 ; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 0
@@ -1394,9 +1388,8 @@ define <vscale x 8 x i64> @vandn_vx_vp_nxv8i64(i64 %a, <vscale x 8 x i64> %b, <v
 ; CHECK-RV32-NEXT:    sw a0, 8(sp)
 ; CHECK-RV32-NEXT:    sw a1, 12(sp)
 ; CHECK-RV32-NEXT:    addi a0, sp, 8
-; CHECK-RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; CHECK-RV32-NEXT:    vlse64.v v16, (a0), zero
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vlse64.v v16, (a0), zero
 ; CHECK-RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1418,9 +1411,8 @@ define <vscale x 8 x i64> @vandn_vx_vp_nxv8i64(i64 %a, <vscale x 8 x i64> %b, <v
 ; CHECK-ZVKB32-NEXT:    sw a0, 8(sp)
 ; CHECK-ZVKB32-NEXT:    sw a1, 12(sp)
 ; CHECK-ZVKB32-NEXT:    addi a0, sp, 8
-; CHECK-ZVKB32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; CHECK-ZVKB32-NEXT:    vlse64.v v16, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-ZVKB32-NEXT:    vlse64.v v16, (a0), zero
 ; CHECK-ZVKB32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-ZVKB32-NEXT:    addi sp, sp, 16
 ; CHECK-ZVKB32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
index 2814be2792de..03e4e1f445be 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
@@ -893,9 +893,8 @@ define <vscale x 1 x i64> @vdiv_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -920,9 +919,8 @@ define <vscale x 1 x i64> @vdiv_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -969,9 +967,8 @@ define <vscale x 2 x i64> @vdiv_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -996,9 +993,8 @@ define <vscale x 2 x i64> @vdiv_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1045,9 +1041,8 @@ define <vscale x 4 x i64> @vdiv_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1072,9 +1067,8 @@ define <vscale x 4 x i64> @vdiv_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1121,9 +1115,8 @@ define <vscale x 8 x i64> @vdiv_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1148,9 +1141,8 @@ define <vscale x 8 x i64> @vdiv_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vdiv.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
index 3e913d4f682e..2f35f91d77a4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
@@ -892,9 +892,8 @@ define <vscale x 1 x i64> @vdivu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -919,9 +918,8 @@ define <vscale x 1 x i64> @vdivu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -968,9 +966,8 @@ define <vscale x 2 x i64> @vdivu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -995,9 +992,8 @@ define <vscale x 2 x i64> @vdivu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1044,9 +1040,8 @@ define <vscale x 4 x i64> @vdivu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1071,9 +1066,8 @@ define <vscale x 4 x i64> @vdivu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1120,9 +1114,8 @@ define <vscale x 8 x i64> @vdivu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1147,9 +1140,8 @@ define <vscale x 8 x i64> @vdivu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vdivu.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
index 87bc9f27d6dc..31359c3f68ec 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
@@ -679,10 +679,10 @@ define <vscale x 1 x half> @vfadd_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -700,10 +700,10 @@ define <vscale x 1 x half> @vfadd_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -722,11 +722,11 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16(<vscale x 1 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfadd_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -747,11 +747,11 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16_commute(<vscale x 1 x half> %va, ha
 ; ZVFHMIN-LABEL: vfadd_vf_nxv1f16_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -772,11 +772,11 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16_unmasked(<vscale x 1 x half> %va, h
 ; ZVFHMIN-LABEL: vfadd_vf_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -797,11 +797,11 @@ define <vscale x 1 x half> @vfadd_vf_nxv1f16_unmasked_commute(<vscale x 1 x half
 ; ZVFHMIN-LABEL: vfadd_vf_nxv1f16_unmasked_commute:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v8, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -823,10 +823,10 @@ define <vscale x 2 x half> @vfadd_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -844,10 +844,10 @@ define <vscale x 2 x half> @vfadd_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -866,11 +866,11 @@ define <vscale x 2 x half> @vfadd_vf_nxv2f16(<vscale x 2 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfadd_vf_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -891,11 +891,11 @@ define <vscale x 2 x half> @vfadd_vf_nxv2f16_unmasked(<vscale x 2 x half> %va, h
 ; ZVFHMIN-LABEL: vfadd_vf_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -917,10 +917,10 @@ define <vscale x 4 x half> @vfadd_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -938,10 +938,10 @@ define <vscale x 4 x half> @vfadd_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v10, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -960,11 +960,11 @@ define <vscale x 4 x half> @vfadd_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfadd_vf_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v10, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -985,11 +985,11 @@ define <vscale x 4 x half> @vfadd_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h
 ; ZVFHMIN-LABEL: vfadd_vf_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v10, v10, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -1011,10 +1011,10 @@ define <vscale x 8 x half> @vfadd_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1032,10 +1032,10 @@ define <vscale x 8 x half> @vfadd_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v12, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1054,11 +1054,11 @@ define <vscale x 8 x half> @vfadd_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfadd_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v12, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1079,11 +1079,11 @@ define <vscale x 8 x half> @vfadd_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h
 ; ZVFHMIN-LABEL: vfadd_vf_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v12, v12, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1105,10 +1105,10 @@ define <vscale x 16 x half> @vfadd_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1126,10 +1126,10 @@ define <vscale x 16 x half> @vfadd_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1148,11 +1148,11 @@ define <vscale x 16 x half> @vfadd_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN-LABEL: vfadd_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1173,11 +1173,11 @@ define <vscale x 16 x half> @vfadd_vf_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-LABEL: vfadd_vf_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1205,23 +1205,22 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1231,10 +1230,11 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  .LBB48_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1266,22 +1266,21 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1291,9 +1290,10 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB49_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1324,14 +1324,10 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v24, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a1, 3
@@ -1352,15 +1348,18 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 3
-; ZVFHMIN-NEXT:    add a3, a4, a3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1370,20 +1369,21 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:  .LBB50_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1416,16 +1416,10 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
@@ -1436,18 +1430,22 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1457,14 +1455,15 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB51_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
index 061af454aa8b..2205769d3494 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
@@ -641,10 +641,10 @@ define <vscale x 1 x half> @vfdiv_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -662,10 +662,10 @@ define <vscale x 1 x half> @vfdiv_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -684,11 +684,11 @@ define <vscale x 1 x half> @vfdiv_vf_nxv1f16(<vscale x 1 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -709,11 +709,11 @@ define <vscale x 1 x half> @vfdiv_vf_nxv1f16_unmasked(<vscale x 1 x half> %va, h
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -735,10 +735,10 @@ define <vscale x 2 x half> @vfdiv_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -756,10 +756,10 @@ define <vscale x 2 x half> @vfdiv_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -778,11 +778,11 @@ define <vscale x 2 x half> @vfdiv_vf_nxv2f16(<vscale x 2 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -803,11 +803,11 @@ define <vscale x 2 x half> @vfdiv_vf_nxv2f16_unmasked(<vscale x 2 x half> %va, h
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -829,10 +829,10 @@ define <vscale x 4 x half> @vfdiv_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -850,10 +850,10 @@ define <vscale x 4 x half> @vfdiv_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v10, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -872,11 +872,11 @@ define <vscale x 4 x half> @vfdiv_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v10, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -897,11 +897,11 @@ define <vscale x 4 x half> @vfdiv_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v10, v10, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -923,10 +923,10 @@ define <vscale x 8 x half> @vfdiv_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -944,10 +944,10 @@ define <vscale x 8 x half> @vfdiv_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v12, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -966,11 +966,11 @@ define <vscale x 8 x half> @vfdiv_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v12, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -991,11 +991,11 @@ define <vscale x 8 x half> @vfdiv_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v12, v12, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1017,10 +1017,10 @@ define <vscale x 16 x half> @vfdiv_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1038,10 +1038,10 @@ define <vscale x 16 x half> @vfdiv_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1060,11 +1060,11 @@ define <vscale x 16 x half> @vfdiv_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1085,11 +1085,11 @@ define <vscale x 16 x half> @vfdiv_vf_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1117,23 +1117,22 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1143,10 +1142,11 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  .LBB44_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1178,22 +1178,21 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1203,9 +1202,10 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB45_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1236,14 +1236,10 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v24, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a1, 3
@@ -1264,15 +1260,18 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 3
-; ZVFHMIN-NEXT:    add a3, a4, a3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1282,20 +1281,21 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:  .LBB46_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1328,16 +1328,10 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
@@ -1348,18 +1342,22 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1369,14 +1367,15 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB47_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
index 02d6229e9924..5d998c4e739d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
@@ -290,10 +290,10 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -311,10 +311,10 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -334,10 +334,10 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -355,10 +355,10 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -378,10 +378,10 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -399,10 +399,10 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v10, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -422,10 +422,10 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -443,10 +443,10 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v12, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -466,10 +466,10 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -487,10 +487,10 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -516,23 +516,22 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -542,10 +541,11 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  .LBB22_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -577,22 +577,21 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -602,9 +601,10 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB23_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
index f7f80299785d..48a4c1386900 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
@@ -290,10 +290,10 @@ define <vscale x 1 x half> @vfmin_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -311,10 +311,10 @@ define <vscale x 1 x half> @vfmin_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -334,10 +334,10 @@ define <vscale x 2 x half> @vfmin_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -355,10 +355,10 @@ define <vscale x 2 x half> @vfmin_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -378,10 +378,10 @@ define <vscale x 4 x half> @vfmin_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -399,10 +399,10 @@ define <vscale x 4 x half> @vfmin_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v10, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -422,10 +422,10 @@ define <vscale x 8 x half> @vfmin_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -443,10 +443,10 @@ define <vscale x 8 x half> @vfmin_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v12, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -466,10 +466,10 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -487,10 +487,10 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -516,23 +516,22 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -542,10 +541,11 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  .LBB22_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -577,22 +577,21 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -602,9 +601,10 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB23_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
index 7e5523044a01..06f74dd99574 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
@@ -19,10 +19,10 @@ define <vscale x 1 x half> @vfmul_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -40,10 +40,10 @@ define <vscale x 1 x half> @vfmul_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -62,11 +62,11 @@ define <vscale x 1 x half> @vfmul_vf_nxv1f16(<vscale x 1 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfmul_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -87,11 +87,11 @@ define <vscale x 1 x half> @vfmul_vf_nxv1f16_unmasked(<vscale x 1 x half> %va, h
 ; ZVFHMIN-LABEL: vfmul_vf_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -113,10 +113,10 @@ define <vscale x 2 x half> @vfmul_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -134,10 +134,10 @@ define <vscale x 2 x half> @vfmul_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -156,11 +156,11 @@ define <vscale x 2 x half> @vfmul_vf_nxv2f16(<vscale x 2 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfmul_vf_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -181,11 +181,11 @@ define <vscale x 2 x half> @vfmul_vf_nxv2f16_unmasked(<vscale x 2 x half> %va, h
 ; ZVFHMIN-LABEL: vfmul_vf_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -207,10 +207,10 @@ define <vscale x 4 x half> @vfmul_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -228,10 +228,10 @@ define <vscale x 4 x half> @vfmul_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v10, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -250,11 +250,11 @@ define <vscale x 4 x half> @vfmul_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfmul_vf_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v10, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -275,11 +275,11 @@ define <vscale x 4 x half> @vfmul_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h
 ; ZVFHMIN-LABEL: vfmul_vf_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v10, v10, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -301,10 +301,10 @@ define <vscale x 8 x half> @vfmul_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -322,10 +322,10 @@ define <vscale x 8 x half> @vfmul_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v12, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -344,11 +344,11 @@ define <vscale x 8 x half> @vfmul_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfmul_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v12, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -369,11 +369,11 @@ define <vscale x 8 x half> @vfmul_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h
 ; ZVFHMIN-LABEL: vfmul_vf_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v12, v12, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -395,10 +395,10 @@ define <vscale x 16 x half> @vfmul_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -416,10 +416,10 @@ define <vscale x 16 x half> @vfmul_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -438,11 +438,11 @@ define <vscale x 16 x half> @vfmul_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN-LABEL: vfmul_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -463,11 +463,11 @@ define <vscale x 16 x half> @vfmul_vf_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-LABEL: vfmul_vf_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -495,23 +495,22 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -521,10 +520,11 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  .LBB20_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -556,22 +556,21 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -581,9 +580,10 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB21_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -614,14 +614,10 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v24, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a1, 3
@@ -642,15 +638,18 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 3
-; ZVFHMIN-NEXT:    add a3, a4, a3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -660,20 +659,21 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:  .LBB22_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -706,16 +706,10 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
@@ -726,18 +720,22 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -747,14 +745,15 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB23_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll
index b7f2133144e7..575d50d11f0a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll
@@ -42,9 +42,9 @@ define <vscale x 2 x i1> @vfptosi_nxv2i1_nxv2f16(<vscale x 2 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i1_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vmsne.vi v0, v8, 0, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -62,9 +62,9 @@ define <vscale x 2 x i1> @vfptosi_nxv2i1_nxv2f16_unmasked(<vscale x 2 x half> %v
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i1_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v9
 ; ZVFHMIN-NEXT:    vmsne.vi v0, v8, 0
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
index d990c74c67d5..e33ab98c0f85 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
@@ -130,9 +130,8 @@ define <vscale x 2 x i7> @vfptosi_v4i7_v4f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfptosi_v4i7_v4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v8, v8, 0, v0.t
@@ -153,9 +152,8 @@ define <vscale x 2 x i8> @vfptosi_nxv2i8_nxv2f16(<vscale x 2 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i8_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v8, v8, 0, v0.t
@@ -174,9 +172,8 @@ define <vscale x 2 x i8> @vfptosi_nxv2i8_nxv2f16_unmasked(<vscale x 2 x half> %v
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i8_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v8, v8, 0
@@ -196,9 +193,8 @@ define <vscale x 2 x i16> @vfptosi_nxv2i16_nxv2f16(<vscale x 2 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i16_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -214,9 +210,8 @@ define <vscale x 2 x i16> @vfptosi_nxv2i16_nxv2f16_unmasked(<vscale x 2 x half>
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i16_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -235,9 +230,9 @@ define <vscale x 2 x i32> @vfptosi_nxv2i32_nxv2f16(<vscale x 2 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i32_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -254,9 +249,9 @@ define <vscale x 2 x i32> @vfptosi_nxv2i32_nxv2f16_unmasked(<vscale x 2 x half>
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i32_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -276,9 +271,9 @@ define <vscale x 2 x i64> @vfptosi_nxv2i64_nxv2f16(<vscale x 2 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vfptosi_nxv2i64_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.rtz.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -286,21 +281,13 @@ define <vscale x 2 x i64> @vfptosi_nxv2i64_nxv2f16(<vscale x 2 x half> %va, <vsc
 }
 
 define <vscale x 2 x i64> @vfptosi_nxv2i64_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
-; ZVFH-LABEL: vfptosi_nxv2i64_nxv2f16_unmasked:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFH-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFH-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFH-NEXT:    vfwcvt.rtz.x.f.v v8, v10
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfptosi_nxv2i64_nxv2f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.rtz.x.f.v v8, v10
-; ZVFHMIN-NEXT:    ret
+; CHECK-LABEL: vfptosi_nxv2i64_nxv2f16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.rtz.x.f.v v8, v10
+; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 2 x i64> %v
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll
index 8ac5992bd5eb..e1d0ad475858 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll
@@ -42,9 +42,9 @@ define <vscale x 2 x i1> @vfptoui_nxv2i1_nxv2f16(<vscale x 2 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i1_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.xu.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vmsne.vi v0, v8, 0, v0.t
 ; ZVFHMIN-NEXT:    ret
@@ -62,9 +62,9 @@ define <vscale x 2 x i1> @vfptoui_nxv2i1_nxv2f16_unmasked(<vscale x 2 x half> %v
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i1_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.xu.f.v v8, v9
 ; ZVFHMIN-NEXT:    vmsne.vi v0, v8, 0
 ; ZVFHMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
index 3b24a648d97f..86222ecfadfe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
@@ -130,9 +130,8 @@ define <vscale x 2 x i7> @vfptoui_v4i7_v4f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfptoui_v4i7_v4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v8, v8, 0, v0.t
@@ -153,9 +152,8 @@ define <vscale x 2 x i8> @vfptoui_nxv2i8_nxv2f16(<vscale x 2 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i8_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v8, v8, 0, v0.t
@@ -174,9 +172,8 @@ define <vscale x 2 x i8> @vfptoui_nxv2i8_nxv2f16_unmasked(<vscale x 2 x half> %v
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i8_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v8, v8, 0
@@ -196,9 +193,8 @@ define <vscale x 2 x i16> @vfptoui_nxv2i16_nxv2f16(<vscale x 2 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i16_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v8, v9, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -214,9 +210,8 @@ define <vscale x 2 x i16> @vfptoui_nxv2i16_nxv2f16_unmasked(<vscale x 2 x half>
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i16_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -235,9 +230,9 @@ define <vscale x 2 x i32> @vfptoui_nxv2i32_nxv2f16(<vscale x 2 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i32_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.xu.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -254,9 +249,9 @@ define <vscale x 2 x i32> @vfptoui_nxv2i32_nxv2f16_unmasked(<vscale x 2 x half>
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i32_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.rtz.xu.f.v v8, v9
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
@@ -276,9 +271,9 @@ define <vscale x 2 x i64> @vfptoui_nxv2i64_nxv2f16(<vscale x 2 x half> %va, <vsc
 ;
 ; ZVFHMIN-LABEL: vfptoui_nxv2i64_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.rtz.xu.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> %m, i32 %evl)
@@ -286,21 +281,13 @@ define <vscale x 2 x i64> @vfptoui_nxv2i64_nxv2f16(<vscale x 2 x half> %va, <vsc
 }
 
 define <vscale x 2 x i64> @vfptoui_nxv2i64_nxv2f16_unmasked(<vscale x 2 x half> %va, i32 zeroext %evl) {
-; ZVFH-LABEL: vfptoui_nxv2i64_nxv2f16_unmasked:
-; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFH-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFH-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFH-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
-; ZVFH-NEXT:    ret
-;
-; ZVFHMIN-LABEL: vfptoui_nxv2i64_nxv2f16_unmasked:
-; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
-; ZVFHMIN-NEXT:    ret
+; CHECK-LABEL: vfptoui_nxv2i64_nxv2f16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vfwcvt.f.f.v v10, v8
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vfwcvt.rtz.xu.f.v v8, v10
+; CHECK-NEXT:    ret
   %v = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 2 x i64> %v
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
index 8e57be1e0697..e94d0a60bbfc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
@@ -242,9 +242,9 @@ define <vscale x 1 x half> @vfsqrt_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v9, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -262,9 +262,9 @@ define <vscale x 1 x half> @vfsqrt_vv_nxv1f16_unmasked(<vscale x 1 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v9, v9
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -284,9 +284,9 @@ define <vscale x 2 x half> @vfsqrt_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v9, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -304,9 +304,9 @@ define <vscale x 2 x half> @vfsqrt_vv_nxv2f16_unmasked(<vscale x 2 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v9, v9
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -326,9 +326,9 @@ define <vscale x 4 x half> @vfsqrt_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -346,9 +346,9 @@ define <vscale x 4 x half> @vfsqrt_vv_nxv4f16_unmasked(<vscale x 4 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v10, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -368,9 +368,9 @@ define <vscale x 8 x half> @vfsqrt_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -388,9 +388,9 @@ define <vscale x 8 x half> @vfsqrt_vv_nxv8f16_unmasked(<vscale x 8 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v12, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -410,9 +410,9 @@ define <vscale x 16 x half> @vfsqrt_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -430,9 +430,9 @@ define <vscale x 16 x half> @vfsqrt_vv_nxv16f16_unmasked(<vscale x 16 x half> %v
 ;
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v16, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -458,13 +458,13 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
@@ -472,9 +472,10 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -498,14 +499,14 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
+; ZVFHMIN-NEXT:    sltu a4, a0, a3
+; ZVFHMIN-NEXT:    addi a4, a4, -1
+; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v16, a2
-; ZVFHMIN-NEXT:    sltu a2, a0, a3
-; ZVFHMIN-NEXT:    addi a2, a2, -1
-; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -513,8 +514,9 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v16, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
index d034f65479a1..56ed560f9ec9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
@@ -641,10 +641,10 @@ define <vscale x 1 x half> @vfsub_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -662,10 +662,10 @@ define <vscale x 1 x half> @vfsub_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -684,11 +684,11 @@ define <vscale x 1 x half> @vfsub_vf_nxv1f16(<vscale x 1 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfsub_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -709,11 +709,11 @@ define <vscale x 1 x half> @vfsub_vf_nxv1f16_unmasked(<vscale x 1 x half> %va, h
 ; ZVFHMIN-LABEL: vfsub_vf_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -735,10 +735,10 @@ define <vscale x 2 x half> @vfsub_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v9, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -756,10 +756,10 @@ define <vscale x 2 x half> @vfsub_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v9, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -778,11 +778,11 @@ define <vscale x 2 x half> @vfsub_vf_nxv2f16(<vscale x 2 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfsub_vf_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v10, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -803,11 +803,11 @@ define <vscale x 2 x half> @vfsub_vf_nxv2f16_unmasked(<vscale x 2 x half> %va, h
 ; ZVFHMIN-LABEL: vfsub_vf_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v9, v10, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
@@ -829,10 +829,10 @@ define <vscale x 4 x half> @vfsub_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v10, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -850,10 +850,10 @@ define <vscale x 4 x half> @vfsub_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v10, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -872,11 +872,11 @@ define <vscale x 4 x half> @vfsub_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfsub_vf_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v10, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -897,11 +897,11 @@ define <vscale x 4 x half> @vfsub_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, h
 ; ZVFHMIN-LABEL: vfsub_vf_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v10, v10, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
@@ -923,10 +923,10 @@ define <vscale x 8 x half> @vfsub_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v12, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -944,10 +944,10 @@ define <vscale x 8 x half> @vfsub_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v12, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -966,11 +966,11 @@ define <vscale x 8 x half> @vfsub_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <
 ; ZVFHMIN-LABEL: vfsub_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v12, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -991,11 +991,11 @@ define <vscale x 8 x half> @vfsub_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, h
 ; ZVFHMIN-LABEL: vfsub_vf_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v12, v12, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
@@ -1017,10 +1017,10 @@ define <vscale x 16 x half> @vfsub_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1038,10 +1038,10 @@ define <vscale x 16 x half> @vfsub_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1060,11 +1060,11 @@ define <vscale x 16 x half> @vfsub_vf_nxv16f16(<vscale x 16 x half> %va, half %b
 ; ZVFHMIN-LABEL: vfsub_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1085,11 +1085,11 @@ define <vscale x 16 x half> @vfsub_vf_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-LABEL: vfsub_vf_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1117,23 +1117,22 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1143,10 +1142,11 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  .LBB44_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1178,22 +1178,21 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1203,9 +1202,10 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB45_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1236,14 +1236,10 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v24, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a1, 3
@@ -1264,15 +1260,18 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 3
-; ZVFHMIN-NEXT:    add a3, a4, a3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1282,20 +1281,21 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:  .LBB46_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -1328,16 +1328,10 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
@@ -1348,18 +1342,22 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1369,14 +1367,15 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:  .LBB47_2:
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir
index f9b81863d68d..c4a383454610 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir
@@ -573,6 +573,86 @@ body: |
     PseudoVSE8_V_MF2 %x, $noreg, 1, 3 /* e8 */
 ...
 ---
+name: vsm_v
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vsm_v
+    ; CHECK: %x:vr = PseudoVMAND_MM_B8 $noreg, $noreg, 1, 0 /* e8 */
+    ; CHECK-NEXT: PseudoVSM_V_B8 %x, $noreg, 1, 0 /* e8 */
+    %x:vr = PseudoVMAND_MM_B8 $noreg, $noreg, -1, 0
+    PseudoVSM_V_B8 %x, $noreg, 1, 0
+...
+---
+name: vsm_v_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vsm_v_incompatible_emul
+    ; CHECK: %x:vr = PseudoVMAND_MM_B8 $noreg, $noreg, -1, 0 /* e8 */
+    ; CHECK-NEXT: PseudoVSM_V_B16 %x, $noreg, 1, 0 /* e8 */
+    %x:vr = PseudoVMAND_MM_B8 $noreg, $noreg, -1, 0
+    PseudoVSM_V_B16 %x, $noreg, 1, 0
+...
+---
+name: vleN_v
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vleN_v
+    ; CHECK: %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
+---
+name: vleN_v_incompatible_eew
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vleN_v_incompatible_eew
+    ; CHECK: %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */
+    %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 4 /* e16 */, 0
+...
+---
+name: vleN_v_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vleN_v_incompatible_emul
+    ; CHECK: %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %x:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0
+    %x:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
+---
+name: vlm_v
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vlm_v
+    ; CHECK: %x:vr = PseudoVLM_V_B8 $noreg, $noreg, 1, 0 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVMAND_MM_B8 $noreg, %x, 1, 0 /* e8 */
+    %x:vr = PseudoVLM_V_B8 $noreg, $noreg, -1, 0, 0
+    %y:vr = PseudoVMAND_MM_B8 $noreg, %x, 1, 0
+...
+---
+name: vlm_v_incompatible_eew
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vlm_v_incompatible_eew
+    ; CHECK: %x:vr = PseudoVLM_V_B8 $noreg, $noreg, -1, 0 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, $noreg, %x, 1, 4 /* e16 */, 0 /* tu, mu */
+    %x:vr = PseudoVLM_V_B8 $noreg, $noreg, -1, 0, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, $noreg, %x, 1, 4 /* e16 */, 0
+...
+---
+name: vlm_v_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vlm_v_incompatible_emul
+    ; CHECK: %x:vr = PseudoVLM_V_B8 $noreg, $noreg, -1, 0 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVMAND_MM_B16 $noreg, %x, 1, 0 /* e8 */
+    %x:vr = PseudoVLM_V_B8 $noreg, $noreg, -1, 0, 0
+    %y:vr = PseudoVMAND_MM_B16 $noreg, %x, 1, 0
+...
+---
 name: vsseN_v
 body: |
   bb.0:
@@ -705,6 +785,56 @@ body: |
     %y:vr = PseudoVLUXEI8_V_MF2_M1 $noreg, $noreg, %x, 1, 4 /* e16 */, 0
 ...
 ---
+name: vluxeiN_v_idx_incompatible_eew
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vluxeiN_v_idx_incompatible_eew
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0
+    %y:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, %x, 1, 3 /* e8 */, 0
+...
+---
+name: vluxeiN_v_idx_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vluxeiN_v_idx_incompatible_emul
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVLUXEI8_V_MF2_MF2 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVLUXEI8_V_MF2_MF2 $noreg, $noreg, %x, 1, 3 /* e8 */, 0
+...
+---
+name: vluxeiN_v_vd
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vluxeiN_v_vd
+    ; CHECK: %x:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
+---
+name: vluxeiN_v_vd_incompatible_eew
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vluxeiN_v_vd_incompatible_eew
+    ; CHECK: %x:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */
+    %x:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 4 /* e16 */, 0
+...
+---
+name: vluxeiN_vd_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vluxeiN_vd_incompatible_emul
+    ; CHECK: %x:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVLUXEI8_V_M1_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
+---
 name: vmop_mm
 body: |
   bb.0:
@@ -1094,3 +1224,116 @@ body: |
     %x:vr = PseudoVMAND_MM_B1 $noreg, $noreg, -1, 0
     %y:vr = PseudoVIOTA_M_MF2 $noreg, %x, 1, 3 /* e8 */, 0
 ...
+name: vred_vs2
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vs2
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
+---
+name: vred_vs1
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vs1
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, $noreg, %x, 1, 3 /* e8 */, 0
+...
+---
+name: vred_vs1_vs2
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vs1_vs2
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, %x, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, %x, %x, 1, 3 /* e8 */, 0
+...
+---
+name: vred_vs1_vs2_incompatible_eew
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vs1_vs2_incompatible_eew
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, %x, %x, 1, 4 /* e16 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDAND_VS_M1_E8 $noreg, %x, %x, 1, 4 /* e16 */, 0
+...
+---
+name: vred_vs1_vs2_incompatible_emul
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vs1_vs2_incompatible_emul
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDAND_VS_MF2_E8 $noreg, %x, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDAND_VS_MF2_E8 $noreg, %x, %x, 1, 3 /* e8 */, 0
+...
+---
+name: vred_other_user_is_vl0
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_other_user_is_vl0
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 0, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 1, 3 /* e8 */, 0
+    %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 0, 3 /* e8 */, 0
+...
+---
+name: vred_both_vl0
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_both_vl0
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 0, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 0, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 0, 3 /* e8 */, 0
+    %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 0, 3 /* e8 */, 0
+...
+---
+name: vred_vl0_and_vlreg
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vl0_and_vlreg
+    ; CHECK: %vl:gprnox0 = COPY $x1
+    ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, %vl, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 0, 3 /* e8 */, 0 /* tu, mu */
+    %vl:gprnox0 = COPY $x1
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, %vl, 3 /* e8 */, 0
+    %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 0, 3 /* e8 */, 0
+...
+---
+name: vred_vlreg_and_vl0
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_vlreg_and_vl0
+    ; CHECK: %vl:gprnox0 = COPY $x1
+    ; CHECK-NEXT: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 0, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, %vl, 3 /* e8 */, 0 /* tu, mu */
+    %vl:gprnox0 = COPY $x1
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 0, 3 /* e8 */, 0
+    %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, %vl, 3 /* e8 */, 0
+...
+---
+name: vred_other_user_is_vl2
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vred_other_user_is_vl2
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 2, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 2, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
+    %y:vr = PseudoVREDSUM_VS_M1_E8 $noreg, $noreg, %x, 1, 3 /* e8 */, 0
+    %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 2, 3 /* e8 */, 0
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir
index 3f966b036589..0a366f4fd89c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir
@@ -110,4 +110,24 @@ body: |
     %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
     %z:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, -1, 3 /* e8 */, 0
 ...
+---
+name: vfcvt_x_f_v_nofpexcept
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vfcvt_x_f_v_nofpexcept
+    ; CHECK: %x:vr = nofpexcept PseudoVFCVT_X_F_V_M1 $noreg, $noreg, 0, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = nofpexcept PseudoVFCVT_X_F_V_M1 $noreg, $noreg, 0, -1, 3 /* e32 */, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
+---
+name: vfcvt_x_f_v_fpexcept
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vfcvt_x_f_v_fpexcept
+    ; CHECK: %x:vr = PseudoVFCVT_X_F_V_M1 $noreg, $noreg, 0, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVFCVT_X_F_V_M1 $noreg, $noreg, 0, -1, 3 /* e32 */, 0
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/vlopt-volatile-ld.mir b/llvm/test/CodeGen/RISCV/rvv/vlopt-volatile-ld.mir
new file mode 100644
index 000000000000..e8f7957de47c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vlopt-volatile-ld.mir
@@ -0,0 +1,13 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc %s -o - -mtriple=riscv64 -mattr=+v -run-pass=riscv-vl-optimizer -verify-machineinstrs | FileCheck %s
+
+---
+name: vleN_v_volatile
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vleN_v
+    ; CHECK: %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */ :: (volatile load (<vscale x 1 x s64>))
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVLE8_V_M1 $noreg, $noreg, -1, 3 /* e8 */, 0 :: (volatile load (<vscale x 1 x s64>))
+    %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll
index 333117c8dce2..c334e70f1f35 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll
@@ -1654,9 +1654,9 @@ define <vscale x 1 x i64> @vmacc_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, tu, mu
 ; RV32-NEXT:    vmacc.vv v9, v8, v10, v0.t
 ; RV32-NEXT:    vmv1r.v v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1685,9 +1685,9 @@ define <vscale x 1 x i64> @vmacc_vx_nxv1i64_unmasked(<vscale x 1 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
 ; RV32-NEXT:    vmacc.vv v9, v8, v10
 ; RV32-NEXT:    vmv1r.v v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1729,9 +1729,8 @@ define <vscale x 1 x i64> @vmacc_vx_nxv1i64_ta(<vscale x 1 x i64> %a, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmacc.vv v9, v8, v10, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1791,9 +1790,9 @@ define <vscale x 2 x i64> @vmacc_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m2, tu, mu
 ; RV32-NEXT:    vmacc.vv v10, v8, v12, v0.t
 ; RV32-NEXT:    vmv2r.v v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1822,9 +1821,9 @@ define <vscale x 2 x i64> @vmacc_vx_nxv2i64_unmasked(<vscale x 2 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m2, tu, ma
 ; RV32-NEXT:    vmacc.vv v10, v8, v12
 ; RV32-NEXT:    vmv2r.v v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1866,9 +1865,8 @@ define <vscale x 2 x i64> @vmacc_vx_nxv2i64_ta(<vscale x 2 x i64> %a, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmacc.vv v10, v8, v12, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1928,9 +1926,9 @@ define <vscale x 4 x i64> @vmacc_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m4, tu, mu
 ; RV32-NEXT:    vmacc.vv v12, v8, v16, v0.t
 ; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1959,9 +1957,9 @@ define <vscale x 4 x i64> @vmacc_vx_nxv4i64_unmasked(<vscale x 4 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m4, tu, ma
 ; RV32-NEXT:    vmacc.vv v12, v8, v16
 ; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2003,9 +2001,8 @@ define <vscale x 4 x i64> @vmacc_vx_nxv4i64_ta(<vscale x 4 x i64> %a, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmacc.vv v12, v8, v16, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2067,9 +2064,9 @@ define <vscale x 8 x i64> @vmacc_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m8, tu, mu
 ; RV32-NEXT:    vmacc.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv8r.v v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2098,9 +2095,9 @@ define <vscale x 8 x i64> @vmacc_vx_nxv8i64_unmasked(<vscale x 8 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m8, tu, ma
 ; RV32-NEXT:    vmacc.vv v16, v8, v24
 ; RV32-NEXT:    vmv8r.v v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2143,9 +2140,8 @@ define <vscale x 8 x i64> @vmacc_vx_nxv8i64_ta(<vscale x 8 x i64> %a, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmacc.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
index 7818e99c5562..3df0763fdc75 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
@@ -1124,9 +1124,8 @@ define <vscale x 1 x i64> @vmax_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1151,9 +1150,8 @@ define <vscale x 1 x i64> @vmax_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1200,9 +1198,8 @@ define <vscale x 2 x i64> @vmax_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1227,9 +1224,8 @@ define <vscale x 2 x i64> @vmax_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1276,9 +1272,8 @@ define <vscale x 4 x i64> @vmax_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1303,9 +1298,8 @@ define <vscale x 4 x i64> @vmax_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1352,9 +1346,8 @@ define <vscale x 8 x i64> @vmax_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1379,9 +1372,8 @@ define <vscale x 8 x i64> @vmax_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmax.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
index 674b0b806000..8147d467be04 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
@@ -1123,9 +1123,8 @@ define <vscale x 1 x i64> @vmaxu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1150,9 +1149,8 @@ define <vscale x 1 x i64> @vmaxu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1199,9 +1197,8 @@ define <vscale x 2 x i64> @vmaxu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1226,9 +1223,8 @@ define <vscale x 2 x i64> @vmaxu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1275,9 +1271,8 @@ define <vscale x 4 x i64> @vmaxu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1302,9 +1297,8 @@ define <vscale x 4 x i64> @vmaxu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1351,9 +1345,8 @@ define <vscale x 8 x i64> @vmaxu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1378,9 +1371,8 @@ define <vscale x 8 x i64> @vmaxu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmaxu.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
index 79631cd80594..614bd4cbde9e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
@@ -1124,9 +1124,8 @@ define <vscale x 1 x i64> @vmin_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1151,9 +1150,8 @@ define <vscale x 1 x i64> @vmin_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1200,9 +1198,8 @@ define <vscale x 2 x i64> @vmin_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1227,9 +1224,8 @@ define <vscale x 2 x i64> @vmin_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1276,9 +1272,8 @@ define <vscale x 4 x i64> @vmin_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1303,9 +1298,8 @@ define <vscale x 4 x i64> @vmin_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1352,9 +1346,8 @@ define <vscale x 8 x i64> @vmin_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1379,9 +1372,8 @@ define <vscale x 8 x i64> @vmin_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmin.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
index bc93b62fab7f..21160553af59 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
@@ -1123,9 +1123,8 @@ define <vscale x 1 x i64> @vminu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1150,9 +1149,8 @@ define <vscale x 1 x i64> @vminu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1199,9 +1197,8 @@ define <vscale x 2 x i64> @vminu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1226,9 +1223,8 @@ define <vscale x 2 x i64> @vminu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1275,9 +1271,8 @@ define <vscale x 4 x i64> @vminu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1302,9 +1297,8 @@ define <vscale x 4 x i64> @vminu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1351,9 +1345,8 @@ define <vscale x 8 x i64> @vminu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1378,9 +1371,8 @@ define <vscale x 8 x i64> @vminu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vminu.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmul-vp.ll
index b63098b64e29..f0907e41cd43 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmul-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmul-vp.ll
@@ -934,9 +934,8 @@ define <vscale x 1 x i64> @vmul_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -961,9 +960,8 @@ define <vscale x 1 x i64> @vmul_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1010,9 +1008,8 @@ define <vscale x 2 x i64> @vmul_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1037,9 +1034,8 @@ define <vscale x 2 x i64> @vmul_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1086,9 +1082,8 @@ define <vscale x 4 x i64> @vmul_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1113,9 +1108,8 @@ define <vscale x 4 x i64> @vmul_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1162,9 +1156,8 @@ define <vscale x 8 x i64> @vmul_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1189,9 +1182,8 @@ define <vscale x 8 x i64> @vmul_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll
index 2e0daa66c82a..3484d288088a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll
@@ -1654,9 +1654,9 @@ define <vscale x 1 x i64> @vnmsac_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, tu, mu
 ; RV32-NEXT:    vnmsac.vv v9, v8, v10, v0.t
 ; RV32-NEXT:    vmv1r.v v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1685,9 +1685,9 @@ define <vscale x 1 x i64> @vnmsac_vx_nxv1i64_unmasked(<vscale x 1 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m1, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
 ; RV32-NEXT:    vnmsac.vv v9, v8, v10
 ; RV32-NEXT:    vmv1r.v v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1729,9 +1729,8 @@ define <vscale x 1 x i64> @vnmsac_vx_nxv1i64_ta(<vscale x 1 x i64> %a, i64 %b, <
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vnmsac.vv v9, v8, v10, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1791,9 +1790,9 @@ define <vscale x 2 x i64> @vnmsac_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m2, tu, mu
 ; RV32-NEXT:    vnmsac.vv v10, v8, v12, v0.t
 ; RV32-NEXT:    vmv2r.v v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1822,9 +1821,9 @@ define <vscale x 2 x i64> @vnmsac_vx_nxv2i64_unmasked(<vscale x 2 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m2, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m2, tu, ma
 ; RV32-NEXT:    vnmsac.vv v10, v8, v12
 ; RV32-NEXT:    vmv2r.v v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1866,9 +1865,8 @@ define <vscale x 2 x i64> @vnmsac_vx_nxv2i64_ta(<vscale x 2 x i64> %a, i64 %b, <
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vnmsac.vv v10, v8, v12, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1928,9 +1926,9 @@ define <vscale x 4 x i64> @vnmsac_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m4, tu, mu
 ; RV32-NEXT:    vnmsac.vv v12, v8, v16, v0.t
 ; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1959,9 +1957,9 @@ define <vscale x 4 x i64> @vnmsac_vx_nxv4i64_unmasked(<vscale x 4 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m4, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m4, tu, ma
 ; RV32-NEXT:    vnmsac.vv v12, v8, v16
 ; RV32-NEXT:    vmv4r.v v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2003,9 +2001,8 @@ define <vscale x 4 x i64> @vnmsac_vx_nxv4i64_ta(<vscale x 4 x i64> %a, i64 %b, <
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vnmsac.vv v12, v8, v16, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2067,9 +2064,9 @@ define <vscale x 8 x i64> @vnmsac_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, tu, mu
+; RV32-NEXT:    vsetvli zero, zero, e64, m8, tu, mu
 ; RV32-NEXT:    vnmsac.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv8r.v v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2098,9 +2095,9 @@ define <vscale x 8 x i64> @vnmsac_vx_nxv8i64_unmasked(<vscale x 8 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, tu, ma
+; RV32-NEXT:    vsetvli zero, zero, e64, m8, tu, ma
 ; RV32-NEXT:    vnmsac.vv v16, v8, v24
 ; RV32-NEXT:    vmv8r.v v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2143,9 +2140,8 @@ define <vscale x 8 x i64> @vnmsac_vx_nxv8i64_ta(<vscale x 8 x i64> %a, i64 %b, <
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vnmsac.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vor-vp.ll
index ef281c52838f..e864d71fdad1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vor-vp.ll
@@ -1326,9 +1326,8 @@ define <vscale x 1 x i64> @vor_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vscal
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1353,9 +1352,8 @@ define <vscale x 1 x i64> @vor_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1422,9 +1420,8 @@ define <vscale x 2 x i64> @vor_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vscal
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1449,9 +1446,8 @@ define <vscale x 2 x i64> @vor_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1518,9 +1514,8 @@ define <vscale x 4 x i64> @vor_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vscal
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1545,9 +1540,8 @@ define <vscale x 4 x i64> @vor_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1614,9 +1608,8 @@ define <vscale x 8 x i64> @vor_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vscal
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1641,9 +1634,8 @@ define <vscale x 8 x i64> @vor_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64 %
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
index 3273274a70b4..66ba2697fe5f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
@@ -893,9 +893,8 @@ define <vscale x 1 x i64> @vrem_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -920,9 +919,8 @@ define <vscale x 1 x i64> @vrem_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -969,9 +967,8 @@ define <vscale x 2 x i64> @vrem_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -996,9 +993,8 @@ define <vscale x 2 x i64> @vrem_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1045,9 +1041,8 @@ define <vscale x 4 x i64> @vrem_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1072,9 +1067,8 @@ define <vscale x 4 x i64> @vrem_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1121,9 +1115,8 @@ define <vscale x 8 x i64> @vrem_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1148,9 +1141,8 @@ define <vscale x 8 x i64> @vrem_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vrem.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
index 6b588d0917ff..4608661eb5df 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
@@ -892,9 +892,8 @@ define <vscale x 1 x i64> @vremu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -919,9 +918,8 @@ define <vscale x 1 x i64> @vremu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -968,9 +966,8 @@ define <vscale x 2 x i64> @vremu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -995,9 +992,8 @@ define <vscale x 2 x i64> @vremu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1044,9 +1040,8 @@ define <vscale x 4 x i64> @vremu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1071,9 +1066,8 @@ define <vscale x 4 x i64> @vremu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1120,9 +1114,8 @@ define <vscale x 8 x i64> @vremu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1147,9 +1140,8 @@ define <vscale x 8 x i64> @vremu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vremu.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vrsub-vp.ll
index 0f38e9408fb5..c41139c64eb0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrsub-vp.ll
@@ -842,9 +842,8 @@ define <vscale x 1 x i64> @vrsub_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -869,9 +868,8 @@ define <vscale x 1 x i64> @vrsub_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v9, v8
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -918,9 +916,8 @@ define <vscale x 2 x i64> @vrsub_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -945,9 +942,8 @@ define <vscale x 2 x i64> @vrsub_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v10, v8
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -994,9 +990,8 @@ define <vscale x 4 x i64> @vrsub_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1021,9 +1016,8 @@ define <vscale x 4 x i64> @vrsub_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v12, v8
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1070,9 +1064,8 @@ define <vscale x 8 x i64> @vrsub_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1097,9 +1090,8 @@ define <vscale x 8 x i64> @vrsub_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v16, v8
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
index 575d041b091d..e471f4b2e92b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
@@ -1425,9 +1425,8 @@ define <vscale x 1 x i64> @vsadd_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1452,9 +1451,8 @@ define <vscale x 1 x i64> @vsadd_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1521,9 +1519,8 @@ define <vscale x 2 x i64> @vsadd_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1548,9 +1545,8 @@ define <vscale x 2 x i64> @vsadd_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1617,9 +1613,8 @@ define <vscale x 4 x i64> @vsadd_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1644,9 +1639,8 @@ define <vscale x 4 x i64> @vsadd_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1713,9 +1707,8 @@ define <vscale x 8 x i64> @vsadd_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1740,9 +1733,8 @@ define <vscale x 8 x i64> @vsadd_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsadd.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
index c9ed72bc63da..f76a2b4b78bc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
@@ -1424,9 +1424,8 @@ define <vscale x 1 x i64> @vsaddu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1451,9 +1450,8 @@ define <vscale x 1 x i64> @vsaddu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1520,9 +1518,8 @@ define <vscale x 2 x i64> @vsaddu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1547,9 +1544,8 @@ define <vscale x 2 x i64> @vsaddu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1616,9 +1612,8 @@ define <vscale x 4 x i64> @vsaddu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1643,9 +1638,8 @@ define <vscale x 4 x i64> @vsaddu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1712,9 +1706,8 @@ define <vscale x 8 x i64> @vsaddu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1739,9 +1732,8 @@ define <vscale x 8 x i64> @vsaddu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsaddu.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
index c0da928a72e9..ebf8d5eeb40b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
@@ -1468,9 +1468,8 @@ define <vscale x 1 x i64> @vssub_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1495,9 +1494,8 @@ define <vscale x 1 x i64> @vssub_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1566,9 +1564,8 @@ define <vscale x 2 x i64> @vssub_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1593,9 +1590,8 @@ define <vscale x 2 x i64> @vssub_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1664,9 +1660,8 @@ define <vscale x 4 x i64> @vssub_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1691,9 +1686,8 @@ define <vscale x 4 x i64> @vssub_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1762,9 +1756,8 @@ define <vscale x 8 x i64> @vssub_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsc
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1789,9 +1782,8 @@ define <vscale x 8 x i64> @vssub_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vssub.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
index b602f11e2c80..d54901c93d53 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
@@ -1466,9 +1466,8 @@ define <vscale x 1 x i64> @vssubu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1493,9 +1492,8 @@ define <vscale x 1 x i64> @vssubu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1564,9 +1562,8 @@ define <vscale x 2 x i64> @vssubu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1591,9 +1588,8 @@ define <vscale x 2 x i64> @vssubu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1662,9 +1658,8 @@ define <vscale x 4 x i64> @vssubu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1689,9 +1684,8 @@ define <vscale x 4 x i64> @vssubu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1760,9 +1754,8 @@ define <vscale x 8 x i64> @vssubu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vs
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1787,9 +1780,8 @@ define <vscale x 8 x i64> @vssubu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i6
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vssubu.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsub-vp.ll
index 65ba791fe780..e28da6bc4ec6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsub-vp.ll
@@ -922,9 +922,8 @@ define <vscale x 1 x i64> @vsub_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -949,9 +948,8 @@ define <vscale x 1 x i64> @vsub_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -998,9 +996,8 @@ define <vscale x 2 x i64> @vsub_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1025,9 +1022,8 @@ define <vscale x 2 x i64> @vsub_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1074,9 +1070,8 @@ define <vscale x 4 x i64> @vsub_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1101,9 +1096,8 @@ define <vscale x 4 x i64> @vsub_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1150,9 +1144,8 @@ define <vscale x 8 x i64> @vsub_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1177,9 +1170,8 @@ define <vscale x 8 x i64> @vsub_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsub.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vxor-vp.ll
index f3dd7ec48881..1694a7af0a0b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxor-vp.ll
@@ -1694,9 +1694,8 @@ define <vscale x 1 x i64> @vxor_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1721,9 +1720,8 @@ define <vscale x 1 x i64> @vxor_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1810,9 +1808,8 @@ define <vscale x 2 x i64> @vxor_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1837,9 +1834,8 @@ define <vscale x 2 x i64> @vxor_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1926,9 +1922,8 @@ define <vscale x 4 x i64> @vxor_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -1953,9 +1948,8 @@ define <vscale x 4 x i64> @vxor_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2042,9 +2036,8 @@ define <vscale x 8 x i64> @vxor_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -2069,9 +2062,8 @@ define <vscale x 8 x i64> @vxor_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vxor.vv v8, v8, v16
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/SPIRV/AtomicCompareExchange.ll b/llvm/test/CodeGen/SPIRV/AtomicCompareExchange.ll
index f8207c56a565..5ce4a1954c5f 100644
--- a/llvm/test/CodeGen/SPIRV/AtomicCompareExchange.ll
+++ b/llvm/test/CodeGen/SPIRV/AtomicCompareExchange.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
 
-; CHECK-SPIRV:      %[[#Int:]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG:      %[[#Int:]] = OpTypeInt 32 0
 ; CHECK-SPIRV-DAG:  %[[#MemScope_CrossDevice:]] = OpConstant %[[#Int]] 0
 ; CHECK-SPIRV-DAG:  %[[#MemSemEqual_SeqCst:]] = OpConstant %[[#Int]] 16
 ; CHECK-SPIRV-DAG:  %[[#MemSemUnequal_Acquire:]] = OpConstant %[[#Int]] 2
diff --git a/llvm/test/CodeGen/SPIRV/event-zero-const.ll b/llvm/test/CodeGen/SPIRV/event-zero-const.ll
index f3f20a0fb2f4..523d2ad9825f 100644
--- a/llvm/test/CodeGen/SPIRV/event-zero-const.ll
+++ b/llvm/test/CodeGen/SPIRV/event-zero-const.ll
@@ -4,10 +4,10 @@
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK: %[[#LongTy:]] = OpTypeInt 64 0
-; CHECK: %[[#EventTy:]] = OpTypeEvent
-; CHECK: %[[#LongNull:]] = OpConstantNull %[[#LongTy]]
-; CHECK: %[[#EventNull:]] = OpConstantNull %[[#EventTy]]
+; CHECK-DAG: %[[#LongTy:]] = OpTypeInt 64 0
+; CHECK-DAG: %[[#EventTy:]] = OpTypeEvent
+; CHECK-DAG: %[[#LongNull:]] = OpConstantNull %[[#LongTy]]
+; CHECK-DAG: %[[#EventNull:]] = OpConstantNull %[[#EventTy]]
 ; CHECK: OpFunction
 ; CHECK: OpINotEqual %[[#]] %[[#]] %[[#LongNull]]
 ; CHECK: OpGroupAsyncCopy %[[#EventTy]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#EventNull]]
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp-simple-hierarchy.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp-simple-hierarchy.ll
index 368c5d4a3298..80309e96db00 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp-simple-hierarchy.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp-simple-hierarchy.ll
@@ -1,16 +1,17 @@
-; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s
 ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; TODO: This test currently fails with LLVM_ENABLE_EXPENSIVE_CHECKS enabled
-; XFAIL: expensive_checks
-
 ; CHECK-DAG: OpName %[[I9:.*]] "_ZN13BaseIncrement9incrementEPi"
 ; CHECK-DAG: OpName %[[I29:.*]] "_ZN12IncrementBy29incrementEPi"
 ; CHECK-DAG: OpName %[[I49:.*]] "_ZN12IncrementBy49incrementEPi"
 ; CHECK-DAG: OpName %[[I89:.*]] "_ZN12IncrementBy89incrementEPi"
+; CHECK-DAG: OpName %[[Foo:.*]] "foo"
 
 ; CHECK-DAG: %[[TyVoid:.*]] = OpTypeVoid
-; CHECK-DAG: %[[TyArr:.*]] = OpTypeArray
+; CHECK-DAG: %[[TyInt32:.*]] = OpTypeInt 32 0
+; CHECK-DAG: %[[TyInt8:.*]] = OpTypeInt 8 0
+; CHECK-DAG: %[[Const8:.*]] = OpConstant %[[TyInt32]] 8
+; CHECK-DAG: %[[TyArr:.*]] = OpTypeArray %[[TyInt8]] %[[Const8]]
 ; CHECK-DAG: %[[TyStruct1:.*]] = OpTypeStruct %[[TyArr]]
 ; CHECK-DAG: %[[TyStruct2:.*]] = OpTypeStruct %[[TyStruct1]]
 ; CHECK-DAG: %[[TyPtrStruct2:.*]] = OpTypePointer Generic %[[TyStruct2]]
@@ -18,16 +19,21 @@
 ; CHECK-DAG: %[[TyPtrFun:.*]] = OpTypePointer Generic %[[TyFun]]
 ; CHECK-DAG: %[[TyPtrPtrFun:.*]] = OpTypePointer Generic %[[TyPtrFun]]
 
-; CHECK: %[[I9]] = OpFunction
-; CHECK: %[[I29]] = OpFunction
-; CHECK: %[[I49]] = OpFunction
-; CHECK: %[[I89]] = OpFunction
+; CHECK-DAG: %[[I9]] = OpFunction
+; CHECK-DAG: %[[I29]] = OpFunction
+; CHECK-DAG: %[[I49]] = OpFunction
+; CHECK-DAG: %[[I89]] = OpFunction
+
+; CHECK: %[[Foo]] = OpFunction
+; CHECK-4: OpFunctionParameter
 
 ; CHECK: %[[Arg1:.*]] = OpPhi %[[TyPtrStruct2]]
 ; CHECK: %[[VTbl:.*]] = OpBitcast %[[TyPtrPtrFun]] %[[#]]
 ; CHECK: %[[FP:.*]] = OpLoad %[[TyPtrFun]] %[[VTbl]]
 ; CHECK: %[[#]] = OpFunctionPointerCallINTEL %[[TyVoid]] %[[FP]] %[[Arg1]] %[[#]]
 
+; CHECK-NO: OpFunction
+
 %"cls::id" = type { %"cls::detail::array" }
 %"cls::detail::array" = type { [1 x i64] }
 %struct.obj_storage_t = type { %"struct.aligned_storage<BaseIncrement, IncrementBy2, IncrementBy4, IncrementBy8>::type" }
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_const.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_const.ll
index 75ad382f05ff..b96da631c0a8 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_const.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fp_const.ll
@@ -1,9 +1,6 @@
-; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_INTEL_function_pointers %s -o - | FileCheck %s
 ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; TODO: This test currently fails with LLVM_ENABLE_EXPENSIVE_CHECKS enabled
-; XFAIL: expensive_checks
-
 ; CHECK-DAG: OpCapability FunctionPointersINTEL
 ; CHECK-DAG: OpCapability Int64
 ; CHECK: OpExtension "SPV_INTEL_function_pointers"
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll
index d38de216c223..8edecc1329d0 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll
@@ -2,15 +2,9 @@
 ; pointers/PtrCast-null-in-OpSpecConstantOp.ll (that is OpSpecConstantOp with ptr-cast operation) correctly
 ; work also for function pointers.
 
-; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - --spirv-ext=+SPV_INTEL_function_pointers | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - --spirv-ext=+SPV_INTEL_function_pointers | FileCheck %s
 ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; TODO: This test currently fails with LLVM_ENABLE_EXPENSIVE_CHECKS enabled
-; XFAIL: expensive_checks
-
-; Running with -verify-machineinstrs would lead to "Reading virtual register without a def"
-; error, because OpConstantFunctionPointerINTEL forward-refers to a function definition.
-
 ; CHECK-COUNT-3: %[[#]] = OpSpecConstantOp %[[#]] 121 %[[#]]
 ; CHECK-COUNT-3: OpPtrCastToGeneric
 
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll
index e006651d49e4..91286d5bf32e 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll
@@ -31,20 +31,20 @@
 ; CHECK-DAG: %[[#Const123:]] = OpConstant %[[#Int32Ty]] 123
 ; CHECK-DAG: %[[#Const42:]] = OpConstant %[[#DoubleTy:]] 42
 
-; CHECK: %[[#Dialect:]] = OpAsmTargetINTEL "spirv64-unknown-unknown"
+; CHECK-DAG: %[[#Dialect:]] = OpAsmTargetINTEL "spirv64-unknown-unknown"
 ; CHECK-NO: OpAsmTargetINTEL
 
-; CHECK: %[[#Asm1:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "" ""
-; CHECK: %[[#Asm2:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "nop" ""
-; CHECK: %[[#Asm3:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "" "~{cc},~{memory}"
-; CHECK: %[[#Asm4:]] = OpAsmINTEL %[[#Int32Ty]] %[[#Fun2Ty:]] %[[#Dialect]] "clobber_out $0" "=&r"
-; CHECK: %[[#Asm5:]] = OpAsmINTEL %[[#Int32Ty]] %[[#Fun3Ty]] %[[#Dialect]] "icmd $0 $1" "=r,r"
-; CHECK: %[[#Asm6:]] = OpAsmINTEL %[[#FloatTy]] %[[#Fun4Ty]] %[[#Dialect]] "fcmd $0 $1" "=r,r"
-; CHECK: %[[#Asm7:]] = OpAsmINTEL %[[#HalfTy]] %[[#Fun5Ty]] %[[#Dialect]] "fcmdext $0 $1 $2" "=r,r,r"
-; CHECK: %[[#Asm8:]] = OpAsmINTEL %[[#Int8Ty]] %[[#Fun6Ty]] %[[#Dialect]] "cmdext $0 $3 $1 $2" "=r,r,r,r"
-; CHECK: %[[#Asm9:]] = OpAsmINTEL %[[#Int64Ty]] %[[#Fun7Ty]] %[[#Dialect]] "icmdext $0 $3 $1 $2" "=r,r,r,r"
-; CHECK: %[[#Asm10:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun8Ty]] %[[#Dialect]] "constcmd $0 $1" "r,r"
-; CHECK: %[[#Asm11:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun8Ty]] %[[#Dialect]] "constcmd $0 $1" "i,i"
+; CHECK-DAG: %[[#Asm1:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "" ""
+; CHECK-DAG: %[[#Asm2:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "nop" ""
+; CHECK-DAG: %[[#Asm3:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "" "~{cc},~{memory}"
+; CHECK-DAG: %[[#Asm4:]] = OpAsmINTEL %[[#Int32Ty]] %[[#Fun2Ty:]] %[[#Dialect]] "clobber_out $0" "=&r"
+; CHECK-DAG: %[[#Asm5:]] = OpAsmINTEL %[[#Int32Ty]] %[[#Fun3Ty]] %[[#Dialect]] "icmd $0 $1" "=r,r"
+; CHECK-DAG: %[[#Asm6:]] = OpAsmINTEL %[[#FloatTy]] %[[#Fun4Ty]] %[[#Dialect]] "fcmd $0 $1" "=r,r"
+; CHECK-DAG: %[[#Asm7:]] = OpAsmINTEL %[[#HalfTy]] %[[#Fun5Ty]] %[[#Dialect]] "fcmdext $0 $1 $2" "=r,r,r"
+; CHECK-DAG: %[[#Asm8:]] = OpAsmINTEL %[[#Int8Ty]] %[[#Fun6Ty]] %[[#Dialect]] "cmdext $0 $3 $1 $2" "=r,r,r,r"
+; CHECK-DAG: %[[#Asm9:]] = OpAsmINTEL %[[#Int64Ty]] %[[#Fun7Ty]] %[[#Dialect]] "icmdext $0 $3 $1 $2" "=r,r,r,r"
+; CHECK-DAG: %[[#Asm10:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun8Ty]] %[[#Dialect]] "constcmd $0 $1" "r,r"
+; CHECK-DAG: %[[#Asm11:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun8Ty]] %[[#Dialect]] "constcmd $0 $1" "i,i"
 ; CHECK-NO: OpAsmINTEL
 
 ; CHECK: OpFunction
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
index 8ecd0a2b25eb..bd07ba1316ec 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
@@ -6,9 +6,11 @@
 
 ; CHECK: OpCapability ShaderClockKHR
 ; CHECK: OpExtension "SPV_KHR_shader_clock"
-; CHECK-DAG: [[uint:%[a-z0-9_]+]] = OpTypeInt 32
+; CHECK-DAG: [[uint:%[a-z0-9_]+]] = OpTypeInt 32 0
 ; CHECK-DAG: [[ulong:%[a-z0-9_]+]] = OpTypeInt 64
 ; CHECK-DAG: [[v2uint:%[a-z0-9_]+]] = OpTypeVector [[uint]] 2
+; CHECK-DAG: OpConstant [[uint]] 8
+; CHECK-DAG: OpConstant [[uint]] 16
 ; CHECK-DAG: [[uint_1:%[a-z0-9_]+]] = OpConstant [[uint]] 1
 ; CHECK-DAG: [[uint_2:%[a-z0-9_]+]] = OpConstant [[uint]] 2
 ; CHECK-DAG: [[uint_3:%[a-z0-9_]+]] = OpConstant [[uint]] 3
diff --git a/llvm/test/CodeGen/SPIRV/global-var-name-linkage.ll b/llvm/test/CodeGen/SPIRV/global-var-name-linkage.ll
new file mode 100644
index 000000000000..4501819ce494
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/global-var-name-linkage.ll
@@ -0,0 +1,59 @@
+; Check names and decoration of global variables.
+
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: OpName %[[#id18:]] "G1"
+; CHECK-DAG: OpName %[[#id22:]] "g1"
+; CHECK-DAG: OpName %[[#id23:]] "g2"
+; CHECK-DAG: OpName %[[#id27:]] "g4"
+; CHECK-DAG: OpName %[[#id30:]] "c1"
+; CHECK-DAG: OpName %[[#id31:]] "n_t"
+; CHECK-DAG: OpName %[[#id32:]] "w"
+; CHECK-DAG: OpName %[[#id34:]] "a.b"
+; CHECK-DAG: OpName %[[#id35:]] "e"
+; CHECK-DAG: OpName %[[#id36:]] "y.z"
+; CHECK-DAG: OpName %[[#id38:]] "x"
+
+; CHECK-NOT: OpDecorate %[[#id18]] LinkageAttributes
+; CHECK-DAG: OpDecorate %[[#id18]] Constant
+; CHECK-DAG: OpDecorate %[[#id22]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id22]] LinkageAttributes "g1" Export
+; CHECK-DAG: OpDecorate %[[#id23]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id27]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id27]] LinkageAttributes "g4" Export
+; CHECK-DAG: OpDecorate %[[#id30]] Constant
+; CHECK-DAG: OpDecorate %[[#id30]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id30]] LinkageAttributes "c1" Export
+; CHECK-DAG: OpDecorate %[[#id31]] Constant
+; CHECK-DAG: OpDecorate %[[#id31]] LinkageAttributes "n_t" Import
+; CHECK-DAG: OpDecorate %[[#id32]] Constant
+; CHECK-DAG: OpDecorate %[[#id32]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id32]] LinkageAttributes "w" Export
+; CHECK-DAG: OpDecorate %[[#id34]] Constant
+; CHECK-DAG: OpDecorate %[[#id34]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id35]] LinkageAttributes "e" Import
+; CHECK-DAG: OpDecorate %[[#id36]] Alignment 4
+; CHECK-DAG: OpDecorate %[[#id38]] Constant
+; CHECK-DAG: OpDecorate %[[#id38]] Alignment 4
+
+%"class.sycl::_V1::nd_item" = type { i8 }
+
+@G1 = private unnamed_addr addrspace(1) constant %"class.sycl::_V1::nd_item" poison, align 1
+@g1 = addrspace(1) global i32 1, align 4
+@g2 = internal addrspace(1) global i32 2, align 4
+@g4 = common addrspace(1) global i32 0, align 4
+@c1 = addrspace(2) constant [2 x i32] [i32 0, i32 1], align 4
+@n_t = external addrspace(2) constant [256 x i32]
+@w = addrspace(1) constant i32 0, align 4
+@a.b = internal addrspace(2) constant [2 x i32] [i32 2, i32 3], align 4
+@e = external addrspace(1) global i32
+@y.z = internal addrspace(1) global i32 0, align 4
+@x = internal addrspace(2) constant float 1.000000e+00, align 4
+
+define internal spir_func void @foo(ptr addrspace(4) align 1 %arg) {
+  ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll
index 2e0eb8c429ac..b1625c07111e 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll
@@ -15,7 +15,7 @@ entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_16]]
   ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_16]]
   ; CHECK: %[[#]] = OpExtInst %[[#vec3_float_16]] %[[#op_ext_glsl]] Cross %[[#arg0]] %[[#arg1]]
-  %hlsl.cross = call <3 x half> @llvm.spv.cross.v4f16(<3 x half> %a, <3 x half> %b)
+  %hlsl.cross = call <3 x half> @llvm.spv.cross.v3f16(<3 x half> %a, <3 x half> %b)
   ret <3 x half> %hlsl.cross
 }
 
@@ -25,9 +25,9 @@ entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_32]]
   ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_32]]
   ; CHECK: %[[#]] = OpExtInst %[[#vec3_float_32]] %[[#op_ext_glsl]] Cross %[[#arg0]] %[[#arg1]]
-  %hlsl.cross = call <3 x float> @llvm.spv.cross.v4f32(<3 x float> %a, <3 x float> %b)
+  %hlsl.cross = call <3 x float> @llvm.spv.cross.v3f32(<3 x float> %a, <3 x float> %b)
   ret <3 x float> %hlsl.cross
 }
 
-declare <3 x half> @llvm.spv.cross.v4f16(<3 x half>, <3 x half>)
-declare <3 x float> @llvm.spv.cross.v4f32(<3 x float>, <3 x float>)
+declare <3 x half> @llvm.spv.cross.v3f16(<3 x half>, <3 x half>)
+declare <3 x float> @llvm.spv.cross.v3f32(<3 x float>, <3 x float>)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll
index b4a9d8e0664b..1ac862b79a3f 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll
@@ -11,19 +11,21 @@
 
 define noundef half @length_half4(<4 x half> noundef %a) {
 entry:
-  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#]] = OpFunction %[[#float_16]] None %[[#]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]]
   ; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Length %[[#arg0]]
-  %hlsl.length = call half @llvm.spv.length.v4f16(<4 x half> %a)
+  %hlsl.length = call half @llvm.spv.length.f16(<4 x half> %a)
   ret half %hlsl.length
 }
 
 define noundef float @length_float4(<4 x float> noundef %a) {
 entry:
-  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#]] = OpFunction %[[#float_32]] None %[[#]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]]
   ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Length %[[#arg0]]
-  %hlsl.length = call float @llvm.spv.length.v4f32(<4 x float> %a)
+  %hlsl.length = call float @llvm.spv.length.f32(<4 x float> %a)
   ret float %hlsl.length
 }
 
-declare half @llvm.spv.length.v4f16(<4 x half>)
-declare float @llvm.spv.length.v4f32(<4 x float>)
+declare half @llvm.spv.length.f16(<4 x half>)
+declare float @llvm.spv.length.f32(<4 x float>)
diff --git a/llvm/test/CodeGen/SPIRV/iaddcarry-builtin.ll b/llvm/test/CodeGen/SPIRV/iaddcarry-builtin.ll
index 8f14eba21b63..49aaa45afed1 100644
--- a/llvm/test/CodeGen/SPIRV/iaddcarry-builtin.ll
+++ b/llvm/test/CodeGen/SPIRV/iaddcarry-builtin.ll
@@ -25,9 +25,7 @@
 ; CHECK-SPIRV-DAG:                    [[v4uint:%[a-z0-9_]+]] = OpTypeVector [[uint]] 4
 ; CHECK-SPIRV-DAG:                 [[vecstruct:%[a-z0-9_]+]] = OpTypeStruct [[v4uint]] [[v4uint]]
 ; CHECK-SPIRV-DAG:   [[_ptr_Function_vecstruct:%[a-z0-9_]+]] = OpTypePointer Function [[vecstruct]]
-; CHECK-SPIRV-DAG:               [[struct_anon:%[a-z0-9_.]+]] = OpTypeStruct [[uint]] [[uint]]
-; CHECK-SPIRV-DAG: [[_ptr_Function_struct_anon:%[a-z0-9_]+]] = OpTypePointer Function [[struct_anon]]
-; CHECK-SPIRV-DAG:  [[_ptr_Generic_struct_anon:%[a-z0-9_]+]] = OpTypePointer Generic [[struct_anon]]
+; CHECK-SPIRV-DAG:    [[_ptr_Generic_i32struct:%[a-z0-9_]+]] = OpTypePointer Generic [[i32struct]]
 
 define spir_func void @test_builtin_iaddcarrycc(i8 %a, i8 %b) {
   entry:
@@ -116,9 +114,9 @@ define spir_func void @test_builtin_iaddcarry_anon(i32 %a, i32 %b) {
 ; CHECK-SPIRV:        [[a_4:%[a-z0-9_]+]] = OpFunctionParameter [[uint]]
 ; CHECK-SPIRV:        [[b_4:%[a-z0-9_]+]] = OpFunctionParameter [[uint]]
 ; CHECK-SPIRV:    [[entry_4:%[a-z0-9_]+]] = OpLabel
-; CHECK-SPIRV:     [[var_59:%[a-z0-9_]+]] = OpVariable [[_ptr_Function_struct_anon]] Function
-; CHECK-SPIRV:     [[var_61:%[a-z0-9_]+]] = OpPtrCastToGeneric [[_ptr_Generic_struct_anon]] [[var_59]]
-; CHECK-SPIRV:     [[var_62:%[a-z0-9_]+]] = OpIAddCarry [[struct_anon]] [[a_4]] [[b_4]]
+; CHECK-SPIRV:     [[var_59:%[a-z0-9_]+]] = OpVariable [[_ptr_Function_i32struct]] Function
+; CHECK-SPIRV:     [[var_61:%[a-z0-9_]+]] = OpPtrCastToGeneric [[_ptr_Generic_i32struct]] [[var_59]]
+; CHECK-SPIRV:     [[var_62:%[a-z0-9_]+]] = OpIAddCarry [[i32struct]] [[a_4]] [[b_4]]
 ; CHECK-SPIRV:                              OpStore [[var_61]] [[var_62]]
 
 declare void @_Z17__spirv_IAddCarryIiiE4anonIT_T0_ES1_S2_(ptr addrspace(4) sret(%struct.anon) align 4, i32, i32)
diff --git a/llvm/test/CodeGen/SPIRV/image-unoptimized.ll b/llvm/test/CodeGen/SPIRV/image-unoptimized.ll
index 0ce9c73ab103..d7d5b1d6b756 100644
--- a/llvm/test/CodeGen/SPIRV/image-unoptimized.ll
+++ b/llvm/test/CodeGen/SPIRV/image-unoptimized.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
 
-; CHECK:     %[[#TypeImage:]] = OpTypeImage
-; CHECK:     %[[#TypeSampler:]] = OpTypeSampler
+; CHECK-DAG: %[[#TypeImage:]] = OpTypeImage
+; CHECK-DAG: %[[#TypeSampler:]] = OpTypeSampler
 ; CHECK-DAG: %[[#TypeImagePtr:]] = OpTypePointer {{.*}} %[[#TypeImage]]
 ; CHECK-DAG: %[[#TypeSamplerPtr:]] = OpTypePointer {{.*}} %[[#TypeSampler]]
 
diff --git a/llvm/test/CodeGen/SPIRV/isubborrow-builtin.ll b/llvm/test/CodeGen/SPIRV/isubborrow-builtin.ll
index 08b4d2a1fa8e..ca842d2f9557 100644
--- a/llvm/test/CodeGen/SPIRV/isubborrow-builtin.ll
+++ b/llvm/test/CodeGen/SPIRV/isubborrow-builtin.ll
@@ -23,9 +23,7 @@
 ; CHECK-SPIRV-DAG:                    [[v4uint:%[a-z0-9_]+]] = OpTypeVector [[uint]] 4
 ; CHECK-SPIRV-DAG:                 [[vecstruct:%[a-z0-9_]+]] = OpTypeStruct [[v4uint]] [[v4uint]]
 ; CHECK-SPIRV-DAG:   [[_ptr_Function_vecstruct:%[a-z0-9_]+]] = OpTypePointer Function [[vecstruct]]
-; CHECK-SPIRV-DAG:               [[struct_anon:%[a-z0-9_.]+]] = OpTypeStruct [[uint]] [[uint]]
-; CHECK-SPIRV-DAG: [[_ptr_Function_struct_anon:%[a-z0-9_]+]] = OpTypePointer Function [[struct_anon]]
-; CHECK-SPIRV-DAG:  [[_ptr_Generic_struct_anon:%[a-z0-9_]+]] = OpTypePointer Generic [[struct_anon]]
+; CHECK-SPIRV-DAG:    [[_ptr_Generic_i32struct:%[a-z0-9_]+]] = OpTypePointer Generic [[i32struct]]
 
 define spir_func void @test_builtin_isubborrowcc(i8 %a, i8 %b) {
   entry:
@@ -114,9 +112,9 @@ define spir_func void @test_builtin_isubborrow_anon(i32 %a, i32 %b) {
 ; CHECK-SPIRV:        [[a_4:%[a-z0-9_]+]] = OpFunctionParameter [[uint]]
 ; CHECK-SPIRV:        [[b_4:%[a-z0-9_]+]] = OpFunctionParameter [[uint]]
 ; CHECK-SPIRV:    [[entry_4:%[a-z0-9_]+]] = OpLabel
-; CHECK-SPIRV:     [[var_59:%[a-z0-9_]+]] = OpVariable [[_ptr_Function_struct_anon]] Function
-; CHECK-SPIRV:     [[var_61:%[a-z0-9_]+]] = OpPtrCastToGeneric [[_ptr_Generic_struct_anon]] [[var_59]]
-; CHECK-SPIRV:     [[var_62:%[a-z0-9_]+]] = OpISubBorrow [[struct_anon]] [[a_4]] [[b_4]]
+; CHECK-SPIRV:     [[var_59:%[a-z0-9_]+]] = OpVariable [[_ptr_Function_i32struct]] Function
+; CHECK-SPIRV:     [[var_61:%[a-z0-9_]+]] = OpPtrCastToGeneric [[_ptr_Generic_i32struct]] [[var_59]]
+; CHECK-SPIRV:     [[var_62:%[a-z0-9_]+]] = OpISubBorrow [[i32struct]] [[a_4]] [[b_4]]
 ; CHECK-SPIRV:                              OpStore [[var_61]] [[var_62]]
 
 declare void @_Z18__spirv_ISubBorrowIiiE4anonIT_T0_ES1_S2_(ptr addrspace(4) sret(%struct.anon) align 4, i32, i32)
diff --git a/llvm/test/CodeGen/SPIRV/keep-tracked-const.ll b/llvm/test/CodeGen/SPIRV/keep-tracked-const.ll
index 0dc86233f8e1..61d06fe2752e 100644
--- a/llvm/test/CodeGen/SPIRV/keep-tracked-const.ll
+++ b/llvm/test/CodeGen/SPIRV/keep-tracked-const.ll
@@ -3,9 +3,9 @@
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK-SPIRV: %[[#Int:]] = OpTypeInt 32 0
-; CHECK-SPIRV: %[[#C0:]] = OpConstant %[[#Int]] 0
-; CHECK-SPIRV: %[[#C1:]] = OpConstant %[[#Int]] 1
+; CHECK-SPIRV-DAG: %[[#Int:]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG: %[[#C0:]] = OpConstant %[[#Int]] 0
+; CHECK-SPIRV-DAG: %[[#C1:]] = OpConstant %[[#Int]] 1
 ; CHECK-SPIRV: OpSelect %[[#Int]] %[[#]] %[[#C1]] %[[#C0]]
 
 
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshl.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshl.ll
index 2d5b30978aa2..25b530461012 100644
--- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshl.ll
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshl.ll
@@ -1,21 +1,21 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
 
-; CHECK-SPIRV:     OpName %[[#NAME_FSHL_FUNC_32:]] "spirv.llvm_fshl_i32"
-; CHECK-SPIRV:     OpName %[[#NAME_FSHL_FUNC_16:]] "spirv.llvm_fshl_i16"
-; CHECK-SPIRV:     OpName %[[#NAME_FSHL_FUNC_VEC_INT_16:]] "spirv.llvm_fshl_v2i16"
-; CHECK-SPIRV:     %[[#TYPE_INT_32:]] = OpTypeInt 32 0
-; CHECK-SPIRV:     %[[#TYPE_ORIG_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
-; CHECK-SPIRV:     %[[#TYPE_INT_16:]] = OpTypeInt 16 0
-; CHECK-SPIRV:     %[[#TYPE_ORIG_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
-; CHECK-SPIRV:     %[[#TYPE_VEC_INT_16:]] = OpTypeVector %[[#TYPE_INT_16]] 2
-; CHECK-SPIRV:     %[[#TYPE_ORIG_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
-; CHECK-SPIRV:     %[[#TYPE_FSHL_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
-; CHECK-SPIRV:     %[[#TYPE_FSHL_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
-; CHECK-SPIRV:     %[[#TYPE_FSHL_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
-; CHECK-SPIRV-DAG: %[[#CONST_ROTATE_32:]] = OpConstant %[[#TYPE_INT_32]] 8
-; CHECK-SPIRV-DAG: %[[#CONST_ROTATE_16:]] = OpConstant %[[#TYPE_INT_16]] 8
-; CHECK-SPIRV:     %[[#CONST_ROTATE_VEC_INT_16:]] = OpConstantComposite %[[#TYPE_VEC_INT_16]] %[[#CONST_ROTATE_16]] %[[#CONST_ROTATE_16]]
-; CHECK-SPIRV-DAG: %[[#CONST_TYPE_SIZE_32:]] = OpConstant %[[#TYPE_INT_32]] 32
+; CHECK-SPIRV-DAG:     OpName %[[#NAME_FSHL_FUNC_32:]] "spirv.llvm_fshl_i32"
+; CHECK-SPIRV-DAG:     OpName %[[#NAME_FSHL_FUNC_16:]] "spirv.llvm_fshl_i16"
+; CHECK-SPIRV-DAG:     OpName %[[#NAME_FSHL_FUNC_VEC_INT_16:]] "spirv.llvm_fshl_v2i16"
+; CHECK-SPIRV-DAG:     %[[#TYPE_INT_32:]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG:     %[[#TYPE_ORIG_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
+; CHECK-SPIRV-DAG:     %[[#TYPE_INT_16:]] = OpTypeInt 16 0
+; CHECK-SPIRV-DAG:     %[[#TYPE_ORIG_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
+; CHECK-SPIRV-DAG:     %[[#TYPE_VEC_INT_16:]] = OpTypeVector %[[#TYPE_INT_16]] 2
+; CHECK-SPIRV-DAG:     %[[#TYPE_ORIG_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
+; CHECK-SPIRV-DAG:     %[[#TYPE_FSHL_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
+; CHECK-SPIRV-DAG:     %[[#TYPE_FSHL_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
+; CHECK-SPIRV-DAG:     %[[#TYPE_FSHL_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
+; CHECK-SPIRV-DAG:     %[[#CONST_ROTATE_32:]] = OpConstant %[[#TYPE_INT_32]] 8
+; CHECK-SPIRV-DAG:     %[[#CONST_ROTATE_16:]] = OpConstant %[[#TYPE_INT_16]] 8
+; CHECK-SPIRV-DAG:     %[[#CONST_ROTATE_VEC_INT_16:]] = OpConstantComposite %[[#TYPE_VEC_INT_16]] %[[#CONST_ROTATE_16]] %[[#CONST_ROTATE_16]]
+; CHECK-SPIRV-DAG:     %[[#CONST_TYPE_SIZE_32:]] = OpConstant %[[#TYPE_INT_32]] 32
 
 ; CHECK-SPIRV: %[[#]] = OpFunction %[[#TYPE_INT_32]] {{.*}} %[[#TYPE_ORIG_FUNC_32]]
 ; CHECK-SPIRV: %[[#X:]] = OpFunctionParameter %[[#TYPE_INT_32]]
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshr.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshr.ll
index 4cf5ca53a411..55fb2d9079e1 100644
--- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshr.ll
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fshr.ll
@@ -1,20 +1,20 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
 
-; CHECK-SPIRV:     OpName %[[#NAME_FSHR_FUNC_32:]] "spirv.llvm_fshr_i32"
-; CHECK-SPIRV:     OpName %[[#NAME_FSHR_FUNC_16:]] "spirv.llvm_fshr_i16"
-; CHECK-SPIRV:     OpName %[[#NAME_FSHR_FUNC_VEC_INT_16:]] "spirv.llvm_fshr_v2i16"
-; CHECK-SPIRV:     %[[#TYPE_INT_32:]] = OpTypeInt 32 0
-; CHECK-SPIRV:     %[[#TYPE_ORIG_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
-; CHECK-SPIRV:     %[[#TYPE_INT_16:]] = OpTypeInt 16 0
-; CHECK-SPIRV:     %[[#TYPE_ORIG_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
-; CHECK-SPIRV:     %[[#TYPE_VEC_INT_16:]] = OpTypeVector %[[#TYPE_INT_16]] 2
-; CHECK-SPIRV:     %[[#TYPE_ORIG_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
-; CHECK-SPIRV:     %[[#TYPE_FSHR_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
-; CHECK-SPIRV:     %[[#TYPE_FSHR_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
-; CHECK-SPIRV:     %[[#TYPE_FSHR_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
+; CHECK-SPIRV-DAG: OpName %[[#NAME_FSHR_FUNC_32:]] "spirv.llvm_fshr_i32"
+; CHECK-SPIRV-DAG: OpName %[[#NAME_FSHR_FUNC_16:]] "spirv.llvm_fshr_i16"
+; CHECK-SPIRV-DAG: OpName %[[#NAME_FSHR_FUNC_VEC_INT_16:]] "spirv.llvm_fshr_v2i16"
+; CHECK-SPIRV-DAG: %[[#TYPE_INT_32:]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG: %[[#TYPE_ORIG_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
+; CHECK-SPIRV-DAG: %[[#TYPE_INT_16:]] = OpTypeInt 16 0
+; CHECK-SPIRV-DAG: %[[#TYPE_ORIG_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
+; CHECK-SPIRV-DAG: %[[#TYPE_VEC_INT_16:]] = OpTypeVector %[[#TYPE_INT_16]] 2
+; CHECK-SPIRV-DAG: %[[#TYPE_ORIG_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
+; CHECK-SPIRV-DAG: %[[#TYPE_FSHR_FUNC_32:]] = OpTypeFunction %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]] %[[#TYPE_INT_32]]
+; CHECK-SPIRV-DAG: %[[#TYPE_FSHR_FUNC_16:]] = OpTypeFunction %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]] %[[#TYPE_INT_16]]
+; CHECK-SPIRV-DAG: %[[#TYPE_FSHR_FUNC_VEC_INT_16:]] = OpTypeFunction %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]] %[[#TYPE_VEC_INT_16]]
 ; CHECK-SPIRV-DAG: %[[#CONST_ROTATE_32:]] = OpConstant %[[#TYPE_INT_32]] 8
 ; CHECK-SPIRV-DAG: %[[#CONST_ROTATE_16:]] = OpConstant %[[#TYPE_INT_16]] 8
-; CHECK-SPIRV:     %[[#CONST_ROTATE_VEC_INT_16:]] = OpConstantComposite %[[#TYPE_VEC_INT_16]] %[[#CONST_ROTATE_16]] %[[#CONST_ROTATE_16]]
+; CHECK-SPIRV-DAG: %[[#CONST_ROTATE_VEC_INT_16:]] = OpConstantComposite %[[#TYPE_VEC_INT_16]] %[[#CONST_ROTATE_16]] %[[#CONST_ROTATE_16]]
 ; CHECK-SPIRV-DAG: %[[#CONST_TYPE_SIZE_32:]] = OpConstant %[[#TYPE_INT_32]] 32
 
 ; CHECK-SPIRV: %[[#]] = OpFunction %[[#TYPE_INT_32]] {{.*}} %[[#TYPE_ORIG_FUNC_32]]
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll
index e7a986980f25..d5e70ae9e7aa 100644
--- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll
@@ -12,17 +12,17 @@
 ; CHECK-DAG: %[[#Int8Ptr:]] = OpTypePointer Generic %[[#Int8]]
 
 ; CHECK-DAG: %[[#Const4:]] = OpConstant %[[#Int32]] 4
-; CHECK: %[[#Int8x4:]] = OpTypeArray %[[#Int8]] %[[#Const4]]
+; CHECK-DAG: %[[#Int8x4:]] = OpTypeArray %[[#Int8]] %[[#Const4]]
 
 ; CHECK-DAG: %[[#Const12:]] = OpConstant %[[#Int32]] 12
-; CHECK: %[[#Int8x12:]] = OpTypeArray %[[#Int8]] %[[#Const12]]
+; CHECK-DAG: %[[#Int8x12:]] = OpTypeArray %[[#Int8]] %[[#Const12]]
 
 ; CHECK-DAG: %[[#Const21:]] = OpConstant %[[#Int8]] 21
 ; CHECK-DAG: %[[#False:]] = OpConstantFalse %[[#]]
 ; CHECK-DAG: %[[#ConstComp:]] = OpConstantComposite %[[#Int8x4]] %[[#Const21]] %[[#Const21]] %[[#Const21]] %[[#Const21]]
 ; CHECK-DAG: %[[#ConstNull:]] = OpConstantNull %[[#Int8x12]]
-; CHECK: %[[#VarComp:]] = OpVariable %[[#]] UniformConstant %[[#ConstComp]]
-; CHECK: %[[#VarNull:]] = OpVariable %[[#]] UniformConstant %[[#ConstNull]]
+; CHECK-DAG: %[[#VarComp:]] = OpVariable %[[#]] UniformConstant %[[#ConstComp]]
+; CHECK-DAG: %[[#VarNull:]] = OpVariable %[[#]] UniformConstant %[[#ConstNull]]
 
 ; CHECK-DAG: %[[#Int8PtrConst:]] = OpTypePointer UniformConstant %[[#Int8]]
 ; CHECK: OpCopyMemorySized %[[#Target:]] %[[#Source:]] %[[#Const12]] Aligned 4
diff --git a/llvm/test/CodeGen/SPIRV/logical-access-chain.ll b/llvm/test/CodeGen/SPIRV/logical-access-chain.ll
index 39f6d33712ef..d56678ecfc2c 100644
--- a/llvm/test/CodeGen/SPIRV/logical-access-chain.ll
+++ b/llvm/test/CodeGen/SPIRV/logical-access-chain.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s
 
-; CHECK:      [[uint:%[0-9]+]] = OpTypeInt 32 0
-; CHECK:     [[uint2:%[0-9]+]] = OpTypeVector [[uint]] 2
-; CHECK:    [[uint_1:%[0-9]+]] = OpConstant [[uint]] 1
-; CHECK:  [[ptr_uint:%[0-9]+]] = OpTypePointer Function [[uint]]
-; CHECK: [[ptr_uint2:%[0-9]+]] = OpTypePointer Function [[uint2]]
+; CHECK-DAG:      [[uint:%[0-9]+]] = OpTypeInt 32 0
+; CHECK-DAG:     [[uint2:%[0-9]+]] = OpTypeVector [[uint]] 2
+; CHECK-DAG:    [[uint_1:%[0-9]+]] = OpConstant [[uint]] 1
+; CHECK-DAG:  [[ptr_uint:%[0-9]+]] = OpTypePointer Function [[uint]]
+; CHECK-DAG: [[ptr_uint2:%[0-9]+]] = OpTypePointer Function [[uint2]]
 
 define void @main() #1 {
 entry:
diff --git a/llvm/test/CodeGen/SPIRV/opencl/degrees.ll b/llvm/test/CodeGen/SPIRV/opencl/degrees.ll
index 88f97835fe71..b8d4f52a2879 100644
--- a/llvm/test/CodeGen/SPIRV/opencl/degrees.ll
+++ b/llvm/test/CodeGen/SPIRV/opencl/degrees.ll
@@ -3,7 +3,7 @@
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "OpenCL.std"
+; CHECK-DAG: %[[#op_ext_ocl:]] = OpExtInstImport "OpenCL.std"
 
 ; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32
 ; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16
@@ -20,7 +20,7 @@ declare <4 x half> @llvm.spv.degrees.v4f16(<4 x half>)
 define noundef float @degrees_float(float noundef %a) {
 entry:
 ; CHECK: %[[#float_32_arg:]] = OpFunctionParameter %[[#float_32]]
-; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] degrees %[[#float_32_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_ocl]] degrees %[[#float_32_arg]]
   %elt.degrees = call float @llvm.spv.degrees.f32(float %a)
   ret float %elt.degrees
 }
@@ -28,7 +28,7 @@ entry:
 define noundef half @degrees_half(half noundef %a) {
 entry:
 ; CHECK: %[[#float_16_arg:]] = OpFunctionParameter %[[#float_16]]
-; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] degrees %[[#float_16_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_ocl]] degrees %[[#float_16_arg]]
   %elt.degrees = call half @llvm.spv.degrees.f16(half %a)
   ret half %elt.degrees
 }
@@ -36,7 +36,7 @@ entry:
 define noundef <4 x float> @degrees_float_vector(<4 x float> noundef %a) {
 entry:
 ; CHECK: %[[#vec4_float_32_arg:]] = OpFunctionParameter %[[#vec4_float_32]]
-; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] degrees %[[#vec4_float_32_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_ocl]] degrees %[[#vec4_float_32_arg]]
   %elt.degrees = call <4 x float> @llvm.spv.degrees.v4f32(<4 x float> %a)
   ret <4 x float> %elt.degrees
 }
@@ -44,7 +44,7 @@ entry:
 define noundef <4 x half> @degrees_half_vector(<4 x half> noundef %a) {
 entry:
 ; CHECK: %[[#vec4_float_16_arg:]] = OpFunctionParameter %[[#vec4_float_16]]
-; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] degrees %[[#vec4_float_16_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_ocl]] degrees %[[#vec4_float_16_arg]]
   %elt.degrees = call <4 x half> @llvm.spv.degrees.v4f16(<4 x half> %a)
   ret <4 x half> %elt.degrees
 }
diff --git a/llvm/test/CodeGen/SPIRV/opencl/radians.ll b/llvm/test/CodeGen/SPIRV/opencl/radians.ll
index f7bb8d5226cd..5b4f26a13a4c 100644
--- a/llvm/test/CodeGen/SPIRV/opencl/radians.ll
+++ b/llvm/test/CodeGen/SPIRV/opencl/radians.ll
@@ -3,7 +3,7 @@
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "OpenCL.std"
+; CHECK-DAG: %[[#op_ext_ocl:]] = OpExtInstImport "OpenCL.std"
 
 ; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32
 ; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16
@@ -20,7 +20,7 @@ declare <4 x half> @llvm.spv.radians.v4f16(<4 x half>)
 define noundef float @radians_float(float noundef %a) {
 entry:
 ; CHECK: %[[#float_32_arg:]] = OpFunctionParameter %[[#float_32]]
-; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] radians %[[#float_32_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_ocl]] radians %[[#float_32_arg]]
   %elt.radians = call float @llvm.spv.radians.f32(float %a)
   ret float %elt.radians
 }
@@ -28,7 +28,7 @@ entry:
 define noundef half @radians_half(half noundef %a) {
 entry:
 ; CHECK: %[[#float_16_arg:]] = OpFunctionParameter %[[#float_16]]
-; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] radians %[[#float_16_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_ocl]] radians %[[#float_16_arg]]
   %elt.radians = call half @llvm.spv.radians.f16(half %a)
   ret half %elt.radians
 }
@@ -36,7 +36,7 @@ entry:
 define noundef <4 x float> @radians_float_vector(<4 x float> noundef %a) {
 entry:
 ; CHECK: %[[#vec4_float_32_arg:]] = OpFunctionParameter %[[#vec4_float_32]]
-; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] radians %[[#vec4_float_32_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_ocl]] radians %[[#vec4_float_32_arg]]
   %elt.radians = call <4 x float> @llvm.spv.radians.v4f32(<4 x float> %a)
   ret <4 x float> %elt.radians
 }
@@ -44,7 +44,7 @@ entry:
 define noundef <4 x half> @radians_half_vector(<4 x half> noundef %a) {
 entry:
 ; CHECK: %[[#vec4_float_16_arg:]] = OpFunctionParameter %[[#vec4_float_16]]
-; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] radians %[[#vec4_float_16_arg]]
+; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_ocl]] radians %[[#vec4_float_16_arg]]
   %elt.radians = call <4 x half> @llvm.spv.radians.v4f16(<4 x half> %a)
   ret <4 x half> %elt.radians
 }
diff --git a/llvm/test/CodeGen/SPIRV/pointers/PtrCast-null-in-OpSpecConstantOp.ll b/llvm/test/CodeGen/SPIRV/pointers/PtrCast-null-in-OpSpecConstantOp.ll
index 99e2c3e6d396..dee16da7e099 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/PtrCast-null-in-OpSpecConstantOp.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/PtrCast-null-in-OpSpecConstantOp.ll
@@ -5,10 +5,8 @@
 ; CHECK-DAG: %[[Struct:.*]] = OpTypeStruct %[[Array]]
 ; CHECK-DAG: %[[Zero:.*]] = OpTypeInt 64 0
 ; CHECK-DAG: %[[Null:.*]] = OpConstantNull %[[Zero]]
-; CHECK-DAG: %[[R1:.*]] = OpConstantComposite %[[Array]] %[[Null]]
-; CHECK-DAG: %[[#]] = OpConstantComposite %[[Struct]] %[[R1]]
-; CHECK-DAG: %[[R2:.*]] = OpConstantComposite %[[Array]] %[[Null]]
-; CHECK-DAG: %[[#]] = OpConstantComposite %[[Struct]] %[[R2]]
+; CHECK-DAG: %[[R:.*]] = OpConstantComposite %[[Array]] %[[Null]]
+; CHECK-DAG: %[[#]] = OpConstantComposite %[[Struct]] %[[R]]
 
 @G1 = addrspace(1) constant { [1 x ptr addrspace(4)] } { [1 x ptr addrspace(4)] [ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4))] }
 @G2 = addrspace(1) constant { [1 x ptr addrspace(4)] } { [1 x ptr addrspace(4)] [ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4))] }
diff --git a/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll b/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll
index 03ecf5e8d839..59a24231769c 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll
@@ -1,12 +1,12 @@
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK: %[[TyInt64:.*]] = OpTypeInt 64 0
-; CHECK: %[[TyInt64Ptr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyInt64]]
-; CHECK: %[[TyStruct:.*]] = OpTypeStruct %[[TyInt64Ptr]] %[[TyInt64Ptr]]
-; CHECK: %[[ConstStruct:.*]] = OpConstantComposite %[[TyStruct]] %[[ConstField:.*]] %[[ConstField]]
-; CHECK: %[[TyStructPtr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyStruct]]
-; CHECK: OpVariable %[[TyStructPtr]] {{[a-zA-Z]+}} %[[ConstStruct]]
+; CHECK-DAG: %[[TyInt64:.*]] = OpTypeInt 64 0
+; CHECK-DAG: %[[TyInt64Ptr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyInt64]]
+; CHECK-DAG: %[[TyStruct:.*]] = OpTypeStruct %[[TyInt64Ptr]] %[[TyInt64Ptr]]
+; CHECK-DAG: %[[ConstStruct:.*]] = OpConstantComposite %[[TyStruct]] %[[ConstField:.*]] %[[ConstField]]
+; CHECK-DAG: %[[TyStructPtr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyStruct]]
+; CHECK-DAG: OpVariable %[[TyStructPtr]] {{[a-zA-Z]+}} %[[ConstStruct]]
 
 @a = addrspace(1) constant i64 42
 @struct = addrspace(1) global {ptr addrspace(1), ptr addrspace(1)} { ptr addrspace(1) @a, ptr addrspace(1) @a }
diff --git a/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint-pass-check.ll b/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint-pass-check.ll
deleted file mode 100644
index 9911b3119ce5..000000000000
--- a/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint-pass-check.ll
+++ /dev/null
@@ -1,90 +0,0 @@
-; RUN: opt -passes='spirv-structurizer' -S -mtriple=spirv-unknown-unknown %s | FileCheck %s
-
-; CHECK-LABEL: define spir_func noundef i32 @test_branch
-; CHECK: call void @llvm.spv.selection.merge.p0(ptr blockaddress(@test_branch, %if.end), i32 1)
-; CHECK-NEXT: br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !{{[0-9]+}}
-define spir_func noundef i32 @test_branch(i32 noundef %X) {
-entry:
-  %X.addr = alloca i32, align 4
-  %resp = alloca i32, align 4
-  store i32 %X, ptr %X.addr, align 4
-  %0 = load i32, ptr %X.addr, align 4
-  %cmp = icmp sgt i32 %0, 0
-  br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !0
-
-if.then:                                          ; preds = %entry
-  %1 = load i32, ptr %X.addr, align 4
-  %sub = sub nsw i32 0, %1
-  store i32 %sub, ptr %resp, align 4
-  br label %if.end
-
-if.else:                                          ; preds = %entry
-  %2 = load i32, ptr %X.addr, align 4
-  %mul = mul nsw i32 %2, 2
-  store i32 %mul, ptr %resp, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  %3 = load i32, ptr %resp, align 4
-  ret i32 %3
-}
-
-; CHECK-LABEL: define spir_func noundef i32 @test_flatten
-; CHECK: call void @llvm.spv.selection.merge.p0(ptr blockaddress(@test_flatten, %if.end), i32 2)
-; CHECK-NEXT: br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !{{[0-9]+}}
-define spir_func noundef i32 @test_flatten(i32 noundef %X) {
-entry:
-  %X.addr = alloca i32, align 4
-  %resp = alloca i32, align 4
-  store i32 %X, ptr %X.addr, align 4
-  %0 = load i32, ptr %X.addr, align 4
-  %cmp = icmp sgt i32 %0, 0
-  br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !1
-
-if.then:                                          ; preds = %entry
-  %1 = load i32, ptr %X.addr, align 4
-  %sub = sub nsw i32 0, %1
-  store i32 %sub, ptr %resp, align 4
-  br label %if.end
-
-if.else:                                          ; preds = %entry
-  %2 = load i32, ptr %X.addr, align 4
-  %mul = mul nsw i32 %2, 2
-  store i32 %mul, ptr %resp, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  %3 = load i32, ptr %resp, align 4
-  ret i32 %3
-}
-; CHECK-LABEL: define spir_func noundef i32 @test_no_attr
-; CHECK: call void @llvm.spv.selection.merge.p0(ptr blockaddress(@test_no_attr, %if.end), i32 0)
-; CHECK-NEXT: br i1 %cmp, label %if.then, label %if.else
-define spir_func noundef i32 @test_no_attr(i32 noundef %X) {
-entry:
-  %X.addr = alloca i32, align 4
-  %resp = alloca i32, align 4
-  store i32 %X, ptr %X.addr, align 4
-  %0 = load i32, ptr %X.addr, align 4
-  %cmp = icmp sgt i32 %0, 0
-  br i1 %cmp, label %if.then, label %if.else
-
-if.then:                                          ; preds = %entry
-  %1 = load i32, ptr %X.addr, align 4
-  %sub = sub nsw i32 0, %1
-  store i32 %sub, ptr %resp, align 4
-  br label %if.end
-
-if.else:                                          ; preds = %entry
-  %2 = load i32, ptr %X.addr, align 4
-  %mul = mul nsw i32 %2, 2
-  store i32 %mul, ptr %resp, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  %3 = load i32, ptr %resp, align 4
-  ret i32 %3
-}
-
-!0 = !{!"hlsl.controlflow.hint", i32 1}
-!1 = !{!"hlsl.controlflow.hint", i32 2}
diff --git a/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint.ll b/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint.ll
deleted file mode 100644
index 848eaf70f5a1..000000000000
--- a/llvm/test/CodeGen/SPIRV/structurizer/HLSLControlFlowHint.ll
+++ /dev/null
@@ -1,91 +0,0 @@
-; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s
-; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %}
-
-
-define spir_func noundef i32 @test_branch(i32 noundef %X) {
-entry:
-; CHECK-LABEL: ; -- Begin function test_branch
-; OpSelectionMerge %[[#]] DontFlatten
-  %X.addr = alloca i32, align 4
-  %resp = alloca i32, align 4
-  store i32 %X, ptr %X.addr, align 4
-  %0 = load i32, ptr %X.addr, align 4
-  %cmp = icmp sgt i32 %0, 0
-  br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !0
-
-if.then:                                          ; preds = %entry
-  %1 = load i32, ptr %X.addr, align 4
-  %sub = sub nsw i32 0, %1
-  store i32 %sub, ptr %resp, align 4
-  br label %if.end
-
-if.else:                                          ; preds = %entry
-  %2 = load i32, ptr %X.addr, align 4
-  %mul = mul nsw i32 %2, 2
-  store i32 %mul, ptr %resp, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  %3 = load i32, ptr %resp, align 4
-  ret i32 %3
-}
-
-
-define spir_func noundef i32 @test_flatten(i32 noundef %X) {
-entry:
-; CHECK-LABEL: ; -- Begin function test_flatten
-; OpSelectionMerge %[[#]] Flatten
-  %X.addr = alloca i32, align 4
-  %resp = alloca i32, align 4
-  store i32 %X, ptr %X.addr, align 4
-  %0 = load i32, ptr %X.addr, align 4
-  %cmp = icmp sgt i32 %0, 0
-  br i1 %cmp, label %if.then, label %if.else, !hlsl.controlflow.hint !1
-
-if.then:                                          ; preds = %entry
-  %1 = load i32, ptr %X.addr, align 4
-  %sub = sub nsw i32 0, %1
-  store i32 %sub, ptr %resp, align 4
-  br label %if.end
-
-if.else:                                          ; preds = %entry
-  %2 = load i32, ptr %X.addr, align 4
-  %mul = mul nsw i32 %2, 2
-  store i32 %mul, ptr %resp, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  %3 = load i32, ptr %resp, align 4
-  ret i32 %3
-}
-
-define spir_func noundef i32 @test_no_attr(i32 noundef %X) {
-entry:
-; CHECK-LABEL: ; -- Begin function test_no_attr
-; OpSelectionMerge %[[#]] None
-  %X.addr = alloca i32, align 4
-  %resp = alloca i32, align 4
-  store i32 %X, ptr %X.addr, align 4
-  %0 = load i32, ptr %X.addr, align 4
-  %cmp = icmp sgt i32 %0, 0
-  br i1 %cmp, label %if.then, label %if.else
-
-if.then:                                          ; preds = %entry
-  %1 = load i32, ptr %X.addr, align 4
-  %sub = sub nsw i32 0, %1
-  store i32 %sub, ptr %resp, align 4
-  br label %if.end
-
-if.else:                                          ; preds = %entry
-  %2 = load i32, ptr %X.addr, align 4
-  %mul = mul nsw i32 %2, 2
-  store i32 %mul, ptr %resp, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  %3 = load i32, ptr %resp, align 4
-  ret i32 %3
-}
-
-!0 = !{!"hlsl.controlflow.hint", i32 1}
-!1 = !{!"hlsl.controlflow.hint", i32 2}
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/SampledImage.ll b/llvm/test/CodeGen/SPIRV/transcoding/SampledImage.ll
index e4c7bdb9e9c8..8a90e40e8881 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/SampledImage.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/SampledImage.ll
@@ -24,12 +24,10 @@
 ; CHECK-SPIRV: OpName %[[#sample_kernel_float:]] "sample_kernel_float"
 ; CHECK-SPIRV: OpName %[[#sample_kernel_int:]] "sample_kernel_int"
 
-; CHECK-SPIRV:     %[[#TypeSampler:]] = OpTypeSampler
+; CHECK-SPIRV-DAG: %[[#TypeSampler:]] = OpTypeSampler
 ; CHECK-SPIRV-DAG: %[[#SampledImageTy:]] = OpTypeSampledImage
 ; CHECK-SPIRV-DAG: %[[#ConstSampler1:]] = OpConstantSampler %[[#TypeSampler]] None 0 Linear
 ; CHECK-SPIRV-DAG: %[[#ConstSampler2:]] = OpConstantSampler %[[#TypeSampler]] Repeat 0 Nearest
-; CHECK-SPIRV-DAG: %[[#ConstSampler3:]] = OpConstantSampler %[[#TypeSampler]] None 0 Linear
-; CHECK-SPIRV-DAG: %[[#ConstSampler4:]] = OpConstantSampler %[[#TypeSampler]] Repeat 0 Nearest
 
 ; CHECK-SPIRV: %[[#sample_kernel_float]] = OpFunction %{{.*}}
 ; CHECK-SPIRV: %[[#InputImage:]] = OpFunctionParameter %{{.*}}
@@ -65,13 +63,13 @@ declare spir_func target("spirv.Sampler") @__translate_sampler_initializer(i32)
 ; CHECK-SPIRV: %[[#InputImage:]] = OpFunctionParameter %{{.*}}
 ; CHECK-SPIRV: %[[#argSampl:]] = OpFunctionParameter %[[#TypeSampler]]
 
-; CHECK-SPIRV: %[[#SampledImage4:]] = OpSampledImage %[[#SampledImageTy]] %[[#InputImage]] %[[#ConstSampler3]]
+; CHECK-SPIRV: %[[#SampledImage4:]] = OpSampledImage %[[#SampledImageTy]] %[[#InputImage]] %[[#ConstSampler1]]
 ; CHECK-SPIRV: %[[#]] = OpImageSampleExplicitLod %[[#]] %[[#SampledImage4]]
 
 ; CHECK-SPIRV: %[[#SampledImage5:]] = OpSampledImage %[[#SampledImageTy]] %[[#InputImage]] %[[#argSampl]]
 ; CHECK-SPIRV: %[[#]] = OpImageSampleExplicitLod %[[#]] %[[#SampledImage5]]
 
-; CHECK-SPIRV: %[[#SampledImage6:]] = OpSampledImage %[[#SampledImageTy]] %[[#InputImage]] %[[#ConstSampler4]]
+; CHECK-SPIRV: %[[#SampledImage6:]] = OpSampledImage %[[#SampledImageTy]] %[[#InputImage]] %[[#ConstSampler2]]
 ; CHECK-SPIRV: %[[#]] = OpImageSampleExplicitLod %[[#]] %[[#SampledImage6]]
 
 define dso_local spir_kernel void @sample_kernel_int(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %input, <2 x float> noundef %coords, <4 x i32> addrspace(1)* nocapture noundef writeonly %results, target("spirv.Sampler") %argSampl) local_unnamed_addr {
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll b/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll
index 8b326e265502..55f1125706f6 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll
@@ -39,7 +39,7 @@
 ; CHECK-SPIRV-DAG: %[[#SAMP:]] = OpTypeSampler
 ; CHECK-SPIRV-DAG: %[[#SAMPIMG:]] = OpTypeSampledImage %[[#IMG2D_RD]]
 
-; CHECK-SPIRV:     %[[#SAMP_CONST:]] = OpConstantSampler %[[#SAMP]] None 0 Linear
+; CHECK-SPIRV-DAG: %[[#SAMP_CONST:]] = OpConstantSampler %[[#SAMP]] None 0 Linear
 
 ; CHECK-SPIRV: %[[#]] = OpFunctionParameter %[[#PIPE_RD]]
 ; CHECK-SPIRV: %[[#]] = OpFunctionParameter %[[#PIPE_WR]]
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll b/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll
index 74dbaab63e03..5810d9c59ee3 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll
@@ -14,19 +14,19 @@
 ; CHECK-SPIRV-DAG:    %[[#twelve:]] = OpConstant %[[#i32]] 12
 ; CHECK-SPIRV-DAG:    %[[#const_i32x3_ptr:]] = OpTypePointer UniformConstant %[[#i32x3]]
 
-; CHECK-SPIRV:        %[[#test_arr2:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]]
-; CHECK-SPIRV:        %[[#test_arr:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]]
+; CHECK-SPIRV-DAG:    %[[#test_arr1:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]]
+; CHECK-SPIRV-DAG:    %[[#test_arr2:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]]
 
 ; CHECK-SPIRV-DAG:    %[[#i32x3_ptr:]] = OpTypePointer Function %[[#i32x3]]
 
-; CHECK-SPIRV:        %[[#arr:]] = OpVariable %[[#i32x3_ptr]] Function
+; CHECK-SPIRV:        %[[#arr1:]] = OpVariable %[[#i32x3_ptr]] Function
 ; CHECK-SPIRV:        %[[#arr2:]] = OpVariable %[[#i32x3_ptr]] Function
 
-; CHECK-SPIRV-32:     OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#twelve]] Aligned 4
+; CHECK-SPIRV-32:     OpCopyMemorySized %[[#arr1]] %[[#test_arr1]] %[[#twelve]] Aligned 4
 ; CHECK-SPIRV-32:     OpCopyMemorySized %[[#arr2]] %[[#test_arr2]] %[[#twelve]] Aligned 4
 
 ; CHECK-SPIRV-64:     %[[#twelvezext1:]] = OpUConvert %[[#i64:]] %[[#twelve:]]
-; CHECK-SPIRV-64:     OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#twelvezext1]] Aligned 4
+; CHECK-SPIRV-64:     OpCopyMemorySized %[[#arr1]] %[[#test_arr1]] %[[#twelvezext1]] Aligned 4
 ; CHECK-SPIRV-64:     %[[#twelvezext2:]] = OpUConvert %[[#i64:]] %[[#twelve:]]
 ; CHECK-SPIRV-64:     OpCopyMemorySized %[[#arr2]] %[[#test_arr2]] %[[#twelvezext2]] Aligned 4
 
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/sub_group_non_uniform_arithmetic.ll b/llvm/test/CodeGen/SPIRV/transcoding/sub_group_non_uniform_arithmetic.ll
index adf73fe153de..62b09f6fe685 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/sub_group_non_uniform_arithmetic.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/sub_group_non_uniform_arithmetic.ll
@@ -329,12 +329,12 @@
 ; CHECK-SPIRV-DAG: %[[#double:]] = OpTypeFloat 64
 
 ; CHECK-SPIRV-DAG: %[[#false:]] = OpConstantFalse %[[#bool]]
+; CHECK-SPIRV-DAG: %[[#int_32:]] = OpConstant %[[#int]] 32
 ; CHECK-SPIRV-DAG: %[[#ScopeSubgroup:]] = OpConstant %[[#int]] 3
 ; CHECK-SPIRV-DAG: %[[#char_0:]] = OpConstant %[[#char]] 0
 ; CHECK-SPIRV-DAG: %[[#char_10:]] = OpConstant %[[#char]] 10
 ; CHECK-SPIRV-DAG: %[[#short_0:]] = OpConstant %[[#short]] 0
 ; CHECK-SPIRV-DAG: %[[#int_0:]] = OpConstant %[[#int]] 0
-; CHECK-SPIRV-DAG: %[[#int_32:]] = OpConstant %[[#int]] 32
 ; CHECK-SPIRV-DAG: %[[#long_0:]] = OpConstantNull %[[#long]]
 ; CHECK-SPIRV-DAG: %[[#half_0:]] = OpConstant %[[#half]] 0
 ; CHECK-SPIRV-DAG: %[[#float_0:]] = OpConstant %[[#float]] 0
diff --git a/llvm/test/CodeGen/SPIRV/unnamed-global.ll b/llvm/test/CodeGen/SPIRV/unnamed-global.ll
index f72334bd7752..90bac50507c0 100644
--- a/llvm/test/CodeGen/SPIRV/unnamed-global.ll
+++ b/llvm/test/CodeGen/SPIRV/unnamed-global.ll
@@ -4,10 +4,10 @@
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK: %[[TyInt:.*]] = OpTypeInt 8 0
-; CHECK: %[[ConstInt:.*]] = OpConstant %[[TyInt]] 123
-; CHECK: %[[TyPtr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyInt]]
-; CHECK: %[[VarId:.*]] = OpVariable %[[TyPtr]] {{[a-zA-Z]+}} %[[ConstInt]]
+; CHECK-DAG: %[[TyInt:.*]] = OpTypeInt 8 0
+; CHECK-DAG: %[[ConstInt:.*]] = OpConstant %[[TyInt]] 123
+; CHECK-DAG: %[[TyPtr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyInt]]
+; CHECK-DAG: %[[VarId:.*]] = OpVariable %[[TyPtr]] {{[a-zA-Z]+}} %[[ConstInt]]
 
 @0 = addrspace(1) global i8 123
 
diff --git a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
index c611e89f2786..f3070cd55903 100644
--- a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
+++ b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' %s -o - | FileCheck %s
 ; This file tests the different cases what are involved when codegen prepare
 ; tries to get sign/zero extension out of the way of addressing mode.
@@ -9,14 +10,17 @@ target triple = "x86_64-apple-macosx"
 
 
 ; Check that we correctly promote both operands of the promotable add.
-; CHECK-LABEL: @twoArgsPromotion
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i32 %arg1 to i64
-; CHECK: [[ARG2SEXT:%[a-zA-Z_0-9-]+]] = sext i32 %arg2 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], [[ARG2SEXT]]
-; CHECK: inttoptr i64 [[PROMOTED]] to ptr
-; CHECK: ret
 define i8 @twoArgsPromotion(i32 %arg1, i32 %arg2) {
-  %add = add nsw i32 %arg1, %arg2 
+; CHECK-LABEL: define i8 @twoArgsPromotion(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED:%.*]] = sext i32 [[ARG1]] to i64
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = sext i32 [[ARG2]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], [[PROMOTED2]]
+; CHECK-NEXT:    [[BASE:%.*]] = inttoptr i64 [[ADD]] to ptr
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[BASE]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add nsw i32 %arg1, %arg2
   %sextadd = sext i32 %add to i64
   %base = inttoptr i64 %sextadd to ptr
   %res = load i8, ptr %base
@@ -28,11 +32,16 @@ define i8 @twoArgsPromotion(i32 %arg1, i32 %arg2) {
 ; Otherwise, we will increase the number of instruction executed.
 ; (This is a heuristic of course, because the new sext could have been
 ; merged with something else.)
-; CHECK-LABEL: @twoArgsNoPromotion
-; CHECK: add nsw i32 %arg1, %arg2
-; CHECK: ret
 define i8 @twoArgsNoPromotion(i32 %arg1, i32 %arg2, ptr %base) {
-  %add = add nsw i32 %arg1, %arg2 
+; CHECK-LABEL: define i8 @twoArgsNoPromotion(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[ARG1]], [[ARG2]]
+; CHECK-NEXT:    [[SEXTADD:%.*]] = sext i32 [[ADD]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[SEXTADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add nsw i32 %arg1, %arg2
   %sextadd = sext i32 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -41,11 +50,16 @@ define i8 @twoArgsNoPromotion(i32 %arg1, i32 %arg2, ptr %base) {
 
 ; Check that we do not promote when the related instruction does not have
 ; the nsw flag.
-; CHECK-LABEL: @noPromotion
-; CHECK-NOT: add i64
-; CHECK: ret
 define i8 @noPromotion(i32 %arg1, i32 %arg2, ptr %base) {
-  %add = add i32 %arg1, %arg2 
+; CHECK-LABEL: define i8 @noPromotion(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[ARG1]], [[ARG2]]
+; CHECK-NEXT:    [[SEXTADD:%.*]] = sext i32 [[ADD]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[SEXTADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add i32 %arg1, %arg2
   %sextadd = sext i32 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -53,13 +67,16 @@ define i8 @noPromotion(i32 %arg1, i32 %arg2, ptr %base) {
 }
 
 ; Check that we correctly promote constant arguments.
-; CHECK-LABEL: @oneArgPromotion
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i32 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotion(i32 %arg1, ptr %base) {
-  %add = add nsw i32 %arg1, 1 
+; CHECK-LABEL: define i8 @oneArgPromotion(
+; CHECK-SAME: i32 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED:%.*]] = sext i32 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add nsw i32 %arg1, 1
   %sextadd = sext i32 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -67,14 +84,17 @@ define i8 @oneArgPromotion(i32 %arg1, ptr %base) {
 }
 
 ; Check that we are able to merge a sign extension with a zero extension.
-; CHECK-LABEL: @oneArgPromotionZExt
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1ZEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionZExt(i8 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionZExt(
+; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = zext i8 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %zext = zext i8 %arg1 to i32
-  %add = add nsw i32 %zext, 1 
+  %add = add nsw i32 %zext, 1
   %sextadd = sext i32 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -88,11 +108,14 @@ define i8 @oneArgPromotionZExt(i8 %arg1, ptr %base) {
 ; more thing in the addressing mode. Therefore the modification is
 ; rolled back.
 ; Still, this test case exercises the desired code path.
-; CHECK-LABEL: @oneArgPromotionCstZExt
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 0, 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionCstZExt(ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionCstZExt(
+; CHECK-SAME: ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 0, 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %cst = zext i16 undef to i32
   %add = add nsw i32 %cst, 1
   %sextadd = sext i32 %add to i64
@@ -103,15 +126,18 @@ define i8 @oneArgPromotionCstZExt(ptr %base) {
 
 ; Check that we do not promote truncate when we cannot determine the
 ; bits that are dropped.
-; CHECK-LABEL: @oneArgPromotionBlockTrunc1
-; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 %arg1 to i8
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[ARG1TRUNC]] to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionBlockTrunc1(i32 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionBlockTrunc1(
+; CHECK-SAME: i32 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[ARG1]] to i8
+; CHECK-NEXT:    [[PROMOTED:%.*]] = sext i8 [[TRUNC]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %trunc = trunc i32 %arg1 to i8
-  %add = add nsw i8 %trunc, 1 
+  %add = add nsw i8 %trunc, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -120,17 +146,20 @@ define i8 @oneArgPromotionBlockTrunc1(i32 %arg1, ptr %base) {
 
 ; Check that we do not promote truncate when we cannot determine all the
 ; bits that are dropped.
-; CHECK-LABEL: @oneArgPromotionBlockTrunc2
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i16 %arg1 to i32
-; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[ARG1SEXT]] to i8
-; CHECK: [[ARG1SEXT64:%[a-zA-Z_0-9-]+]] = sext i8 [[ARG1TRUNC]] to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT64]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionBlockTrunc2(i16 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionBlockTrunc2(
+; CHECK-SAME: i16 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[SEXTARG1:%.*]] = sext i16 [[ARG1]] to i32
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SEXTARG1]] to i8
+; CHECK-NEXT:    [[PROMOTED:%.*]] = sext i8 [[TRUNC]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %sextarg1 = sext i16 %arg1 to i32
   %trunc = trunc i32 %sextarg1 to i8
-  %add = add nsw i8 %trunc, 1 
+  %add = add nsw i8 %trunc, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -139,15 +168,18 @@ define i8 @oneArgPromotionBlockTrunc2(i16 %arg1, ptr %base) {
 
 ; Check that we are able to promote truncate when we know all the bits
 ; that are dropped.
-; CHECK-LABEL: @oneArgPromotionPassTruncKeepSExt
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionPassTruncKeepSExt(i1 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionPassTruncKeepSExt(
+; CHECK-SAME: i1 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED:%.*]] = sext i1 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %sextarg1 = sext i1 %arg1 to i32
   %trunc = trunc i32 %sextarg1 to i8
-  %add = add nsw i8 %trunc, 1 
+  %add = add nsw i8 %trunc, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -156,17 +188,19 @@ define i8 @oneArgPromotionPassTruncKeepSExt(i1 %arg1, ptr %base) {
 
 ; On X86 truncate are free. Check that we are able to promote the add
 ; to be used as addressing mode and that we insert a truncate for the other
-; use. 
-; CHECK-LABEL: @oneArgPromotionTruncInsert
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
-; CHECK: [[TRUNC:%[a-zA-Z_0-9-]+]] = trunc i64 [[PROMOTED]] to i8
-; CHECK: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: [[LOAD:%[a-zA-Z_0-9-]+]] = load i8, ptr [[GEP]]
-; CHECK: add i8 [[LOAD]], [[TRUNC]]
-; CHECK: ret
+; use.
 define i8 @oneArgPromotionTruncInsert(i8 %arg1, ptr %base) {
-  %add = add nsw i8 %arg1, 1 
+; CHECK-LABEL: define i8 @oneArgPromotionTruncInsert(
+; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = sext i8 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], 1
+; CHECK-NEXT:    [[PROMOTED:%.*]] = trunc i64 [[ADD]] to i8
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[FINALRES:%.*]] = add i8 [[RES]], [[PROMOTED]]
+; CHECK-NEXT:    ret i8 [[FINALRES]]
+;
+  %add = add nsw i8 %arg1, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -175,15 +209,20 @@ define i8 @oneArgPromotionTruncInsert(i8 %arg1, ptr %base) {
 }
 
 ; Cannot sext from a larger type than the promoted type.
-; CHECK-LABEL: @oneArgPromotionLargerType
-; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i128 %arg1 to i8
-; CHECK: [[ARG1SEXT64:%[a-zA-Z_0-9-]+]] = sext i8 [[ARG1TRUNC]] to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT64]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionLargerType(i128 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionLargerType(
+; CHECK-SAME: i128 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i128 [[ARG1]] to i8
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = sext i8 [[TRUNC]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], 1
+; CHECK-NEXT:    [[PROMOTED:%.*]] = trunc i64 [[ADD]] to i8
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[FINALRES:%.*]] = add i8 [[RES]], [[PROMOTED]]
+; CHECK-NEXT:    ret i8 [[FINALRES]]
+;
   %trunc = trunc i128 %arg1 to i8
-  %add = add nsw i8 %trunc, 1 
+  %add = add nsw i8 %trunc, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -194,18 +233,20 @@ define i8 @oneArgPromotionLargerType(i128 %arg1, ptr %base) {
 ; Use same inserted trunc
 ; On X86 truncate are free. Check that we are able to promote the add
 ; to be used as addressing mode and that we insert a truncate for
-; *all* the other uses. 
-; CHECK-LABEL: @oneArgPromotionTruncInsertSeveralUse
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
-; CHECK: [[TRUNC:%[a-zA-Z_0-9-]+]] = trunc i64 [[PROMOTED]] to i8
-; CHECK: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: [[LOAD:%[a-zA-Z_0-9-]+]] = load i8, ptr [[GEP]]
-; CHECK: [[ADDRES:%[a-zA-Z_0-9-]+]] = add i8 [[LOAD]], [[TRUNC]]
-; CHECK: add i8 [[ADDRES]], [[TRUNC]]
-; CHECK: ret
+; *all* the other uses.
 define i8 @oneArgPromotionTruncInsertSeveralUse(i8 %arg1, ptr %base) {
-  %add = add nsw i8 %arg1, 1 
+; CHECK-LABEL: define i8 @oneArgPromotionTruncInsertSeveralUse(
+; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = sext i8 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], 1
+; CHECK-NEXT:    [[PROMOTED:%.*]] = trunc i64 [[ADD]] to i8
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ALMOSTFINALRES:%.*]] = add i8 [[RES]], [[PROMOTED]]
+; CHECK-NEXT:    [[FINALRES:%.*]] = add i8 [[ALMOSTFINALRES]], [[PROMOTED]]
+; CHECK-NEXT:    ret i8 [[FINALRES]]
+;
+  %add = add nsw i8 %arg1, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -216,16 +257,18 @@ define i8 @oneArgPromotionTruncInsertSeveralUse(i8 %arg1, ptr %base) {
 
 ; Check that the promoted instruction is used for all uses of the original
 ; sign extension.
-; CHECK-LABEL: @oneArgPromotionSExtSeveralUse
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
-; CHECK: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: [[LOAD:%[a-zA-Z_0-9-]+]] = load i8, ptr [[GEP]]
-; CHECK: [[ADDRES:%[a-zA-Z_0-9-]+]] = zext i8 [[LOAD]] to i64
-; CHECK: add i64 [[ADDRES]], [[PROMOTED]]
-; CHECK: ret
 define i64 @oneArgPromotionSExtSeveralUse(i8 %arg1, ptr %base) {
-  %add = add nsw i8 %arg1, 1 
+; CHECK-LABEL: define i64 @oneArgPromotionSExtSeveralUse(
+; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED:%.*]] = sext i8 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ALMOSTFINALRES:%.*]] = zext i8 [[RES]] to i64
+; CHECK-NEXT:    [[FINALRES:%.*]] = add i64 [[ALMOSTFINALRES]], [[ADD]]
+; CHECK-NEXT:    ret i64 [[FINALRES]]
+;
+  %add = add nsw i8 %arg1, 1
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -249,16 +292,19 @@ define i64 @oneArgPromotionSExtSeveralUse(i8 %arg1, ptr %base) {
 ; - Setting the operands of the promoted instruction with the promoted values.
 ; - Moving instruction around (mainly sext when promoting instruction).
 ; Each type of those promotions has to be undo at least once during this
-; specific test. 
-; CHECK-LABEL: @twoArgsPromotionNest
-; CHECK: [[ORIG:%[a-zA-Z_0-9-]+]] = add nsw i32 %arg1, %arg2
-; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ORIG]], [[ORIG]]
-; CHECK: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[SEXT]]
-; CHECK: ret
+; specific test.
 define i8 @twoArgsPromotionNest(i32 %arg1, i32 %arg2, ptr %base) {
+; CHECK-LABEL: define i8 @twoArgsPromotionNest(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTABLEADD1:%.*]] = add nsw i32 [[ARG1]], [[ARG2]]
+; CHECK-NEXT:    [[PROMOTABLEADD2:%.*]] = add nsw i32 [[PROMOTABLEADD1]], [[PROMOTABLEADD1]]
+; CHECK-NEXT:    [[SEXTADD:%.*]] = sext i32 [[PROMOTABLEADD2]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[SEXTADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %promotableadd1 = add nsw i32 %arg1, %arg2
-  %promotableadd2 = add nsw i32 %promotableadd1, %promotableadd1 
+  %promotableadd2 = add nsw i32 %promotableadd1, %promotableadd1
   %sextadd = sext i32 %promotableadd2 to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -270,18 +316,21 @@ define i8 @twoArgsPromotionNest(i32 %arg1, i32 %arg2, ptr %base) {
 ; The matcher first promotes the add, removes the trunc and promotes
 ; the sext of arg1.
 ; Then, the matcher cannot use an addressing mode r + r + r, thus it
-; rolls back. 
-; CHECK-LABEL: @twoArgsNoPromotionRemove
-; CHECK: [[SEXTARG1:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i32
-; CHECK: [[TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[SEXTARG1]] to i8
-; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[TRUNC]], %arg2
-; CHECK: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i64
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[SEXT]]
-; CHECK: ret
+; rolls back.
 define i8 @twoArgsNoPromotionRemove(i1 %arg1, i8 %arg2, ptr %base) {
+; CHECK-LABEL: define i8 @twoArgsNoPromotionRemove(
+; CHECK-SAME: i1 [[ARG1:%.*]], i8 [[ARG2:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[SEXTARG1:%.*]] = sext i1 [[ARG1]] to i32
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SEXTARG1]] to i8
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i8 [[TRUNC]], [[ARG2]]
+; CHECK-NEXT:    [[SEXTADD:%.*]] = sext i8 [[ADD]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[SEXTADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %sextarg1 = sext i1 %arg1 to i32
   %trunc = trunc i32 %sextarg1 to i8
-  %add = add nsw i8 %trunc, %arg2 
+  %add = add nsw i8 %trunc, %arg2
   %sextadd = sext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %sextadd
   %res = load i8, ptr %arrayidx
@@ -301,29 +350,40 @@ define i8 @twoArgsNoPromotionRemove(i1 %arg1, i8 %arg2, ptr %base) {
 ; Check that we did not promote anything in the final matching.
 ;
 ; <rdar://problem/16020230>
-; CHECK-LABEL: @checkProfitability
-; CHECK-NOT: {{%[a-zA-Z_0-9-]+}} = sext i32 %arg1 to i64
-; CHECK-NOT: {{%[a-zA-Z_0-9-]+}} = sext i32 %arg2 to i64
-; CHECK: [[SHL:%[a-zA-Z_0-9-]+]] = shl nsw i32 %arg1, 1
-; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SHL]], %arg2
-; CHECK: [[SEXTADD:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64
 ; BB then
-; CHECK: [[BASE1:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[SEXTADD]] to ptr
-; CHECK: [[FULL1:%[a-zA-Z_0-9-]+]] = getelementptr i8, ptr [[BASE1]], i64 48
-; CHECK: load i32, ptr [[FULL1]]
 ; BB else
-; CHECK: [[BASE2:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[SEXTADD]] to ptr
-; CHECK: [[FULL2:%[a-zA-Z_0-9-]+]] = getelementptr i8, ptr [[BASE2]], i64 48
-; CHECK: load i32, ptr [[FULL2]]
-; CHECK: ret
 define i32 @checkProfitability(i32 %arg1, i32 %arg2, i1 %test) {
+; CHECK-LABEL: define i32 @checkProfitability(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i1 [[TEST:%.*]]) {
+; CHECK-NEXT:    [[SHL:%.*]] = shl nsw i32 [[ARG1]], 1
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[SHL]], [[ARG2]]
+; CHECK-NEXT:    [[SEXTIDX1:%.*]] = sext i32 [[ADD1]] to i64
+; CHECK-NEXT:    br i1 [[TEST]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = inttoptr i64 [[SEXTIDX1]] to ptr
+; CHECK-NEXT:    [[SUNKADDR13:%.*]] = getelementptr i8, ptr [[SUNKADDR]], i64 48
+; CHECK-NEXT:    [[RES1:%.*]] = load i32, ptr [[SUNKADDR13]], align 4
+; CHECK-NEXT:    br label %[[END:.*]]
+; CHECK:       [[ELSE]]:
+; CHECK-NEXT:    [[SUNKADDR17:%.*]] = inttoptr i64 [[SEXTIDX1]] to ptr
+; CHECK-NEXT:    [[SUNKADDR18:%.*]] = getelementptr i8, ptr [[SUNKADDR17]], i64 48
+; CHECK-NEXT:    [[RES2:%.*]] = load i32, ptr [[SUNKADDR18]], align 4
+; CHECK-NEXT:    br label %[[END]]
+; CHECK:       [[END]]:
+; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ [[RES1]], %[[THEN]] ], [ [[RES2]], %[[ELSE]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[SEXTIDX1]] to i32
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[TMP]], [[TMP1]]
+; CHECK-NEXT:    [[ADDR:%.*]] = inttoptr i32 [[RES]] to ptr
+; CHECK-NEXT:    [[FINAL:%.*]] = load i32, ptr [[ADDR]], align 4
+; CHECK-NEXT:    ret i32 [[FINAL]]
+;
   %shl = shl nsw i32 %arg1, 1
   %add1 = add nsw i32 %shl, %arg2
   %sextidx1 = sext i32 %add1 to i64
   %tmpptr = inttoptr i64 %sextidx1 to ptr
   %arrayidx1 = getelementptr i32, ptr %tmpptr, i64 12
   br i1 %test, label %then, label %else
-then: 
+then:
   %res1 = load i32, ptr %arrayidx1
   br label %end
 else:
@@ -346,15 +406,47 @@ end:
 ; We used to crash on this function because we did not return the right
 ; promoted instruction for %conv.i.
 ; Make sure we generate the right code now.
-; CHECK-LABEL: @fn3
 ; %conv.i is used twice and only one of its use is being promoted.
 ; Use it at the starting point for the matching.
-; CHECK: %conv.i = zext i16 [[PLAIN_OPND:%[.a-zA-Z_0-9-]+]] to i32
-; CHECK-NEXT: [[PROMOTED_CONV:%[.a-zA-Z_0-9-]+]] = zext i16 [[PLAIN_OPND]] to i64
-; CHECK-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = getelementptr i8, ptr %P, i64 [[PROMOTED_CONV]]
-; CHECK-NEXT: [[ADDR:%[a-zA-Z_0-9-]+]] = getelementptr i8, ptr [[ADD]], i64 7
-; CHECK-NEXT: load i8, ptr [[ADDR]], align 1
 define signext i16 @fn3(ptr nocapture readonly %P) {
+; CHECK-LABEL: define signext i16 @fn3(
+; CHECK-SAME: ptr nocapture readonly [[P:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[WHILE_BODY_I_I:.*]]
+; CHECK:       [[WHILE_BODY_I_I]]:
+; CHECK-NEXT:    [[SRC_ADDR_0_I_I:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[INC_I_I:%.*]], %[[WHILE_BODY_I_I]] ]
+; CHECK-NEXT:    [[INC_I_I]] = add i16 [[SRC_ADDR_0_I_I]], 1
+; CHECK-NEXT:    [[IDXPROM_I_I:%.*]] = sext i16 [[SRC_ADDR_0_I_I]] to i64
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[IDXPROM_I_I]]
+; CHECK-NEXT:    [[SUNKADDR2:%.*]] = getelementptr inbounds i8, ptr [[SUNKADDR]], i64 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[SUNKADDR2]], align 1
+; CHECK-NEXT:    [[CONV2_I_I:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[AND_I_I:%.*]] = and i32 [[CONV2_I_I]], 15
+; CHECK-NEXT:    store i32 [[AND_I_I]], ptr @a, align 4
+; CHECK-NEXT:    [[TOBOOL_I_I:%.*]] = icmp eq i32 [[AND_I_I]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_I_I]], label %[[WHILE_BODY_I_I]], label %[[FN1_EXIT_I:.*]]
+; CHECK:       [[FN1_EXIT_I]]:
+; CHECK-NEXT:    [[CONV_I:%.*]] = zext i16 [[INC_I_I]] to i32
+; CHECK-NEXT:    [[PROMOTED4:%.*]] = zext i16 [[INC_I_I]] to i64
+; CHECK-NEXT:    [[SUNKADDR5:%.*]] = getelementptr i8, ptr [[P]], i64 [[PROMOTED4]]
+; CHECK-NEXT:    [[SUNKADDR6:%.*]] = getelementptr i8, ptr [[SUNKADDR5]], i64 7
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[SUNKADDR6]], align 1
+; CHECK-NEXT:    [[CONV2_I:%.*]] = sext i8 [[TMP2]] to i16
+; CHECK-NEXT:    store i16 [[CONV2_I]], ptr @b, align 2
+; CHECK-NEXT:    [[SUB4_I:%.*]] = sub nsw i32 0, [[CONV_I]]
+; CHECK-NEXT:    [[CONV5_I:%.*]] = zext i16 [[CONV2_I]] to i32
+; CHECK-NEXT:    [[CMP_I:%.*]] = icmp sgt i32 [[CONV5_I]], [[SUB4_I]]
+; CHECK-NEXT:    br i1 [[CMP_I]], label %[[IF_THEN_I:.*]], label %[[FN2_EXIT:.*]]
+; CHECK:       [[IF_THEN_I]]:
+; CHECK-NEXT:    [[END_I:%.*]] = getelementptr inbounds [[STRUCT_DNS_PACKET:%.*]], ptr [[P]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[END_I]], align 4
+; CHECK-NEXT:    [[SUB7_I:%.*]] = add i32 [[TMP3]], 65535
+; CHECK-NEXT:    [[CONV8_I:%.*]] = trunc i32 [[SUB7_I]] to i16
+; CHECK-NEXT:    br label %[[FN2_EXIT]]
+; CHECK:       [[FN2_EXIT]]:
+; CHECK-NEXT:    [[RETVAL_0_I:%.*]] = phi i16 [ [[CONV8_I]], %[[IF_THEN_I]] ], [ undef, %[[FN1_EXIT_I]] ]
+; CHECK-NEXT:    ret i16 [[RETVAL_0_I]]
+;
 entry:
   %tmp = getelementptr inbounds %struct.dns_packet, ptr %P, i64 0, i32 2
   br label %while.body.i.i
@@ -399,13 +491,16 @@ fn2.exit:                                         ; preds = %if.then.i, %fn1.exi
 
 ; Check that we do not promote an extension if the non-wrapping flag does not
 ; match the kind of the extension.
-; CHECK-LABEL: @noPromotionFlag
-; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 %arg1, %arg2
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = zext i32 [[ADD]] to i64
-; CHECK: inttoptr i64 [[PROMOTED]] to ptr
-; CHECK: ret
 define i8 @noPromotionFlag(i32 %arg1, i32 %arg2) {
-  %add = add nsw i32 %arg1, %arg2 
+; CHECK-LABEL: define i8 @noPromotionFlag(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]]) {
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[ARG1]], [[ARG2]]
+; CHECK-NEXT:    [[ZEXTADD:%.*]] = zext i32 [[ADD]] to i64
+; CHECK-NEXT:    [[BASE:%.*]] = inttoptr i64 [[ZEXTADD]] to ptr
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[BASE]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add nsw i32 %arg1, %arg2
   %zextadd = zext i32 %add to i64
   %base = inttoptr i64 %zextadd to ptr
   %res = load i8, ptr %base
@@ -413,14 +508,17 @@ define i8 @noPromotionFlag(i32 %arg1, i32 %arg2) {
 }
 
 ; Check that we correctly promote both operands of the promotable add with zext.
-; CHECK-LABEL: @twoArgsPromotionZExt
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i32 %arg1 to i64
-; CHECK: [[ARG2ZEXT:%[a-zA-Z_0-9-]+]] = zext i32 %arg2 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], [[ARG2ZEXT]]
-; CHECK: inttoptr i64 [[PROMOTED]] to ptr
-; CHECK: ret
 define i8 @twoArgsPromotionZExt(i32 %arg1, i32 %arg2) {
-  %add = add nuw i32 %arg1, %arg2 
+; CHECK-LABEL: define i8 @twoArgsPromotionZExt(
+; CHECK-SAME: i32 [[ARG1:%.*]], i32 [[ARG2:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED:%.*]] = zext i32 [[ARG1]] to i64
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = zext i32 [[ARG2]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED]], [[PROMOTED2]]
+; CHECK-NEXT:    [[BASE:%.*]] = inttoptr i64 [[ADD]] to ptr
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[BASE]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add nuw i32 %arg1, %arg2
   %zextadd = zext i32 %add to i64
   %base = inttoptr i64 %zextadd to ptr
   %res = load i8, ptr %base
@@ -428,13 +526,16 @@ define i8 @twoArgsPromotionZExt(i32 %arg1, i32 %arg2) {
 }
 
 ; Check that we correctly promote constant arguments.
-; CHECK-LABEL: @oneArgPromotionNegativeCstZExt
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 255
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionNegativeCstZExt(i8 %arg1, ptr %base) {
-  %add = add nuw i8 %arg1, -1 
+; CHECK-LABEL: define i8 @oneArgPromotionNegativeCstZExt(
+; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED:%.*]] = zext i8 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED]], 255
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
+  %add = add nuw i8 %arg1, -1
   %zextadd = zext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd
   %res = load i8, ptr %arrayidx
@@ -442,14 +543,17 @@ define i8 @oneArgPromotionNegativeCstZExt(i8 %arg1, ptr %base) {
 }
 
 ; Check that we are able to merge two zero extensions.
-; CHECK-LABEL: @oneArgPromotionZExtZExt
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionZExtZExt(i8 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionZExtZExt(
+; CHECK-SAME: i8 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = zext i8 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED2]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %zext = zext i8 %arg1 to i32
-  %add = add nuw i32 %zext, 1 
+  %add = add nuw i32 %zext, 1
   %zextadd = zext i32 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd
   %res = load i8, ptr %arrayidx
@@ -458,17 +562,20 @@ define i8 @oneArgPromotionZExtZExt(i8 %arg1, ptr %base) {
 
 ; Check that we do not promote truncate when the dropped bits
 ; are of a different kind.
-; CHECK-LABEL: @oneArgPromotionBlockTruncZExt
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i32
-; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[ARG1SEXT]] to i8
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[ARG1TRUNC]] to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionBlockTruncZExt(i1 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionBlockTruncZExt(
+; CHECK-SAME: i1 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[SEXTARG1:%.*]] = sext i1 [[ARG1]] to i32
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[SEXTARG1]] to i8
+; CHECK-NEXT:    [[PROMOTED:%.*]] = zext i8 [[TRUNC]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %sextarg1 = sext i1 %arg1 to i32
   %trunc = trunc i32 %sextarg1 to i8
-  %add = add nuw i8 %trunc, 1 
+  %add = add nuw i8 %trunc, 1
   %zextadd = zext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd
   %res = load i8, ptr %arrayidx
@@ -477,15 +584,18 @@ define i8 @oneArgPromotionBlockTruncZExt(i1 %arg1, ptr %base) {
 
 ; Check that we are able to promote truncate when we know all the bits
 ; that are dropped.
-; CHECK-LABEL: @oneArgPromotionPassTruncZExt
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i1 %arg1 to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionPassTruncZExt(i1 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionPassTruncZExt(
+; CHECK-SAME: i1 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[PROMOTED2:%.*]] = zext i1 [[ARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED2]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %sextarg1 = zext i1 %arg1 to i32
   %trunc = trunc i32 %sextarg1 to i8
-  %add = add nuw i8 %trunc, 1 
+  %add = add nuw i8 %trunc, 1
   %zextadd = zext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd
   %res = load i8, ptr %arrayidx
@@ -493,15 +603,18 @@ define i8 @oneArgPromotionPassTruncZExt(i1 %arg1, ptr %base) {
 }
 
 ; Check that we do not promote sext with zext.
-; CHECK-LABEL: @oneArgPromotionBlockSExtZExt
-; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i8
-; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[ARG1SEXT]] to i64
-; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
-; CHECK: getelementptr inbounds i8, ptr %base, i64 [[PROMOTED]]
-; CHECK: ret
 define i8 @oneArgPromotionBlockSExtZExt(i1 %arg1, ptr %base) {
+; CHECK-LABEL: define i8 @oneArgPromotionBlockSExtZExt(
+; CHECK-SAME: i1 [[ARG1:%.*]], ptr [[BASE:%.*]]) {
+; CHECK-NEXT:    [[SEXTARG1:%.*]] = sext i1 [[ARG1]] to i8
+; CHECK-NEXT:    [[PROMOTED:%.*]] = zext i8 [[SEXTARG1]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD]]
+; CHECK-NEXT:    [[RES:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret i8 [[RES]]
+;
   %sextarg1 = sext i1 %arg1 to i8
-  %add = add nuw i8 %sextarg1, 1 
+  %add = add nuw i8 %sextarg1, 1
   %zextadd = zext i8 %add to i64
   %arrayidx = getelementptr inbounds i8, ptr %base, i64 %zextadd
   %res = load i8, ptr %arrayidx
diff --git a/llvm/test/CodeGen/X86/uint_to_half.ll b/llvm/test/CodeGen/X86/uint_to_half.ll
new file mode 100644
index 000000000000..b62a07eec1ce
--- /dev/null
+++ b/llvm/test/CodeGen/X86/uint_to_half.ll
@@ -0,0 +1,198 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+f16c | FileCheck %s -check-prefixes=AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+f16c | FileCheck %s -check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s -check-prefixes=AVX512
+
+define <8 x half> @test_uitofp_v8i32_v8f16(<8 x i32> %a) {
+; AVX1-LABEL: test_uitofp_v8i32_v8f16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_uitofp_v8i32_v8f16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; AVX2-NEXT:    vsubps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_uitofp_v8i32_v8f16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvtudq2ps %ymm0, %ymm0
+; AVX512-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %vec = uitofp <8 x i32> %a to <8 x half>
+  ret <8 x half> %vec
+}
+
+define <8 x half> @test_strict_uitofp_v8i32_v8f16(<8 x i32> %a) {
+; AVX1-LABEL: test_strict_uitofp_v8i32_v8f16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_strict_uitofp_v8i32_v8f16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; AVX2-NEXT:    vsubps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_strict_uitofp_v8i32_v8f16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvtudq2ps %ymm0, %ymm0
+; AVX512-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %vec = tail call <8 x half> @llvm.experimental.constrained.uitofp.f16.i32(<8 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <8 x half> %vec
+}
+
+define <16 x half> @test_uitofp_v16i32_v16f16(<16 x i32> %a) {
+; AVX1-LABEL: test_uitofp_v16i32_v16f16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
+; AVX1-NEXT:    vmulps %ymm3, %ymm2, %ymm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpsrld $16, %xmm5, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT:    vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT:    vmulps %ymm3, %ymm2, %ymm2
+; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
+; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vcvtps2ph $4, %ymm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_uitofp_v16i32_v16f16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; AVX2-NEXT:    vsubps %ymm5, %ymm0, %ymm0
+; AVX2-NEXT:    vaddps %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7],ymm1[8],ymm4[9],ymm1[10],ymm4[11],ymm1[12],ymm4[13],ymm1[14],ymm4[15]
+; AVX2-NEXT:    vsubps %ymm5, %ymm1, %ymm1
+; AVX2-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vcvtps2ph $4, %ymm1, %xmm1
+; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_uitofp_v16i32_v16f16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %vec = uitofp <16 x i32> %a to <16 x half>
+  ret <16 x half> %vec
+}
+
+define <16 x half> @test_strict_uitofp_v16i32_v16f16(<16 x i32> %a) {
+; AVX1-LABEL: test_strict_uitofp_v16i32_v16f16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
+; AVX1-NEXT:    vmulps %ymm3, %ymm2, %ymm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpsrld $16, %xmm5, %xmm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT:    vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT:    vmulps %ymm3, %ymm2, %ymm2
+; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
+; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vcvtps2ph $4, %ymm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_strict_uitofp_v16i32_v16f16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
+; AVX2-NEXT:    vsubps %ymm5, %ymm0, %ymm0
+; AVX2-NEXT:    vaddps %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vcvtps2ph $4, %ymm0, %xmm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7],ymm1[8],ymm4[9],ymm1[10],ymm4[11],ymm1[12],ymm4[13],ymm1[14],ymm4[15]
+; AVX2-NEXT:    vsubps %ymm5, %ymm1, %ymm1
+; AVX2-NEXT:    vaddps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vcvtps2ph $4, %ymm1, %xmm1
+; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_strict_uitofp_v16i32_v16f16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f16.i32(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  ret <16 x half> %vec
+}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 4f42d5c65528..15e287d66754 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -4129,6 +4129,62 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz
   ret <32 x i8> %shuffle
 }
 
+; PR121823
+define <32 x i8> @shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a)  {
+; AVX1-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[1,9,0,3]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,9,0,3,11,2,5,13,4,7,15,6],zero,zero,zero,zero
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[11,2,5,13,4,7,15,6],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,9,0,3,11,2,5,13,4,7,15,6,u,u,u,u,17,25,16,19,27,18,21,29,20,23,31,22,u,u,u,u]
+; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,2,4,5,6,0,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[1,9,0,3,11,2,5,13,4,7,15,6,u,u,u,u,17,25,16,19,27,18,21,29,20,23,31,22,u,u,u,u]
+; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,1,2,4,5,6,14,15]
+; AVX512VLBW-NEXT:    vpermi2d %ymm2, %ymm1, %ymm0
+; AVX512VLBW-NEXT:    retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX512VLVBMI:       # %bb.0:
+; AVX512VLVBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,9,0,3,11,2,5,13,4,7,15,6,17,25,16,19,27,18,21,29,20,23,31,22,56,57,58,59,60,61,62,63]
+; AVX512VLVBMI-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT:    retq
+;
+; XOPAVX1-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[1,9,0,3,11,2,5,13,4,7,15,6],xmm1[1,9,0,3]
+; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[11,2,5,13,4,7,15,6],zero,zero,zero,zero,zero,zero,zero,zero
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: shuffle_v32i8_01_09_00_03_11_02_05_13_04_07_15_06_17_25_16_19_27_18_21_29_20_23_31_22_zz_zz_zz_zz_zz_zz_zz_zz:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,9,0,3,11,2,5,13,4,7,15,6,u,u,u,u,17,25,16,19,27,18,21,29,20,23,31,22,u,u,u,u]
+; XOPAVX2-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [0,1,2,4,5,6,0,0]
+; XOPAVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; XOPAVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; XOPAVX2-NEXT:    retq
+  %r = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 9, i32 0, i32 3, i32 11, i32 2, i32 5, i32 13, i32 4, i32 7, i32 15, i32 6, i32 17, i32 25, i32 16, i32 19, i32 27, i32 18, i32 21, i32 29, i32 20, i32 23, i32 31, i32 22, i32 32, i32 32, i32 32, i32 32, i32 48, i32 48, i32 48, i32 48>
+  ret <32 x i8> %r
+}
+
 define <32 x i8> @shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
 ; AVX1:       # %bb.0:
diff --git a/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll b/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll
index 26ad59723abf..82301e42f7d0 100644
--- a/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll
+++ b/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll
@@ -4,7 +4,7 @@
 @GLOBAL = addrspace(1) externally_initialized global i32 0, align 4, !dbg !0
 @SHARED = addrspace(3) externally_initialized global i32 undef, align 4, !dbg !6
 
-define void @test(float, ptr, ptr, i32) !dbg !17 {
+define ptx_kernel void @test(float, ptr, ptr, i32) !dbg !17 {
   %5 = alloca float, align 4
   %6 = alloca ptr, align 8
   %7 = alloca ptr, align 8
@@ -38,7 +38,6 @@ define void @test(float, ptr, ptr, i32) !dbg !17 {
 declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
 !llvm.dbg.cu = !{!2}
-!nvvm.annotations = !{!10}
 !llvm.module.flags = !{!11, !12, !13, !14, !15}
 !llvm.ident = !{!16}
 
@@ -52,7 +51,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
 !7 = distinct !DIGlobalVariable(name: "SHARED", scope: !2, file: !8, line: 4, type: !9, isLocal: false, isDefinition: true)
 !8 = !DIFile(filename: "test.cu", directory: "/tmp")
 !9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-!10 = !{ptr @test, !"kernel", i32 1}
 !11 = !{i32 2, !"Dwarf Version", i32 2}
 !12 = !{i32 2, !"Debug Info Version", i32 3}
 !13 = !{i32 1, !"wchar_size", i32 4}
diff --git a/llvm/test/DebugInfo/NVPTX/debug-info.ll b/llvm/test/DebugInfo/NVPTX/debug-info.ll
index 55c81caaed05..c926229f96e3 100644
--- a/llvm/test/DebugInfo/NVPTX/debug-info.ll
+++ b/llvm/test/DebugInfo/NVPTX/debug-info.ll
@@ -59,7 +59,7 @@
 ; CHECK: }
 
 ; Function Attrs: nounwind
-define void @_Z5saxpyifPfS_(i32 %n, float %a, ptr nocapture readonly %x, ptr nocapture %y) local_unnamed_addr #0 !dbg !566 {
+define ptx_kernel void @_Z5saxpyifPfS_(i32 %n, float %a, ptr nocapture readonly %x, ptr nocapture %y) local_unnamed_addr #0 !dbg !566 {
 entry:
   call void @llvm.dbg.value(metadata i32 %n, metadata !570, metadata !DIExpression()), !dbg !575
   call void @llvm.dbg.value(metadata float %a, metadata !571, metadata !DIExpression()), !dbg !576
@@ -8496,7 +8496,6 @@ attributes #2 = { nounwind readnone speculatable }
 attributes #3 = { nounwind }
 
 !llvm.dbg.cu = !{!0}
-!nvvm.annotations = !{!555, !556, !557, !556, !558, !558, !558, !558, !559, !559, !558}
 !llvm.module.flags = !{!560, !561, !562, !563}
 !llvm.ident = !{!564}
 !nvvm.internalize.after.link = !{}
@@ -9057,11 +9056,6 @@ attributes #3 = { nounwind }
 !552 = !DISubprogram(name: "tgammaf", linkageName: "_ZL7tgammaff", scope: !444, file: !444, line: 1592, type: !13, isLocal: true, isDefinition: false, flags: DIFlagPrototyped, isOptimized: true)
 !553 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !5, entity: !554, file: !445, line: 459)
 !554 = !DISubprogram(name: "truncf", linkageName: "_ZL6truncff", scope: !462, file: !462, line: 662, type: !13, isLocal: true, isDefinition: false, flags: DIFlagPrototyped, isOptimized: true)
-!555 = !{ptr @_Z5saxpyifPfS_, !"kernel", i32 1}
-!556 = !{null, !"align", i32 8}
-!557 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
-!558 = !{null, !"align", i32 16}
-!559 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
 !560 = !{i32 2, !"Dwarf Version", i32 2}
 !561 = !{i32 2, !"Debug Info Version", i32 3}
 !562 = !{i32 1, !"wchar_size", i32 4}
diff --git a/llvm/test/DebugInfo/X86/dwarf5-debug-names-addr-tu-to-non-tu.ll b/llvm/test/DebugInfo/X86/dwarf5-debug-names-addr-tu-to-non-tu.ll
new file mode 100644
index 000000000000..a836b2a44e84
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/dwarf5-debug-names-addr-tu-to-non-tu.ll
@@ -0,0 +1,83 @@
+; RUN: llc -filetype=obj -O0 -generate-type-units -mtriple=x86_64-unknown-linux-gnu < %s \
+; RUN:     | llvm-dwarfdump -debug-info -debug-names - \
+; RUN:     | FileCheck %s
+
+;; Test that an entry in the debug names table gets created for a top level DIE when the creation of TU fails.
+
+;; clang++ -O0 main.cpp -gdwarf-5 -fdebug-types-section -gpubnames -S -emit-llvm -glldb -o main.ll
+;; int foo;
+;; namespace {
+;; struct t1 {};
+;; } // namespace
+;; template <int *> struct t2 {
+;;   t1 v1;
+;; };
+;; struct t3 {
+;;   t2<&foo> v1;
+;; };
+;; t3 v1;
+
+; CHECK: [[OFFSET:0x[0-9a-f]*]]:   DW_TAG_structure_type
+; CHECK: [[OFFSET1:0x[0-9a-f]*]]:   DW_TAG_structure_type
+
+; CHECK:        Bucket 0 [
+; CHECK-NEXT:    Name 1 {
+; CHECK-NEXT:      Hash: {{.+}}
+; CHECK-NEXT:      String: {{.+}} "t3"
+; CHECK-NEXT:      Entry @ {{.+}} {
+; CHECK-NEXT:        Abbrev: 0x1
+; CHECK-NEXT:        Tag: DW_TAG_structure_type
+; CHECK-NEXT:        DW_IDX_die_offset: [[OFFSET]]
+; CHECK-NEXT:        DW_IDX_parent: <parent not indexed>
+
+; CHECK:        Name 5 {
+; CHECK-NEXT:      Hash: {{.+}}
+; CHECK-NEXT:      String: {{.+}} "t2<&foo>"
+; CHECK-NEXT:      Entry @ 0xe1 {
+; CHECK-NEXT:        Abbrev: 0x1
+; CHECK-NEXT:        Tag: DW_TAG_structure_type
+; CHECK-NEXT:        DW_IDX_die_offset: [[OFFSET1]]
+; CHECK-NEXT:        DW_IDX_parent: <parent not indexed>
+
+; ModuleID = 'main.cpp'
+source_filename = "main.cpp"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.t3 = type { i8 }
+
+@foo = dso_local global i32 0, align 4, !dbg !0
+@v1 = dso_local global %struct.t3 zeroinitializer, align 1, !dbg !5
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!20, !21, !22, !23, !24, !25, !26}
+!llvm.ident = !{!27}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "foo", scope: !2, file: !3, line: 1, type: !19, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 20.0.0git (git@github.com:llvm/llvm-project.git ba373096e8ac83a7136fc44bc4e71a7bc53417a6)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, sysroot: "/")
+!3 = !DIFile(filename: "main.cpp", directory: "/StructuredType", checksumkind: CSK_MD5, checksum: "f91f8d905197b1c0309da9526bc4776e")
+!4 = !{!0, !5}
+!5 = !DIGlobalVariableExpression(var: !6, expr: !DIExpression())
+!6 = distinct !DIGlobalVariable(name: "v1", scope: !2, file: !3, line: 11, type: !7, isLocal: false, isDefinition: true)
+!7 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t3", file: !3, line: 8, size: 8, flags: DIFlagTypePassByValue, elements: !8, identifier: "_ZTS2t3")
+!8 = !{!9}
+!9 = !DIDerivedType(tag: DW_TAG_member, name: "v1", scope: !7, file: !3, line: 9, baseType: !10, size: 8)
+!10 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t2<&foo>", file: !3, line: 5, size: 8, flags: DIFlagTypePassByValue, elements: !11, templateParams: !16, identifier: "_ZTS2t2IXadL_Z3fooEEE")
+!11 = !{!12}
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "v1", scope: !10, file: !3, line: 6, baseType: !13, size: 8)
+!13 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t1", scope: !14, file: !3, line: 3, size: 8, flags: DIFlagTypePassByValue, elements: !15)
+!14 = !DINamespace(scope: null)
+!15 = !{}
+!16 = !{!17}
+!17 = !DITemplateValueParameter(type: !18, value: ptr @foo)
+!18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !19, size: 64)
+!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!20 = !{i32 7, !"Dwarf Version", i32 5}
+!21 = !{i32 2, !"Debug Info Version", i32 3}
+!22 = !{i32 1, !"wchar_size", i32 4}
+!23 = !{i32 8, !"PIC Level", i32 2}
+!24 = !{i32 7, !"PIE Level", i32 2}
+!25 = !{i32 7, !"uwtable", i32 2}
+!26 = !{i32 7, !"frame-pointer", i32 2}
+!27 = !{!"clang version 20.0.0git (git@github.com:llvm/llvm-project.git ba373096e8ac83a7136fc44bc4e71a7bc53417a6)"}
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s
index b4f6e04d4eb4..9296f048e51e 100644
--- a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s
+++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_data_alignment.s
@@ -1,16 +1,18 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=armv7-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_armv7.o %s
 # RUN: llvm-objdump -s --section=.rodata %t_armv7.o | FileCheck --check-prefix=CHECK-OBJ %s
-# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \
-# RUN:              -slab-page-size 4096 %t_armv7.o -debug-only=jitlink 2>&1 \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -slab-address 0x76ff0000 -slab-allocate 10Kb \
+# RUN:              -slab-page-size 4096 %t_armv7.o 2>&1 \
 # RUN:              | FileCheck --check-prefix=CHECK-LG %s
 # RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \
 # RUN:              -slab-page-size 4096 %t_armv7.o -check %s
 
 # RUN: llvm-mc -triple=thumbv7-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_thumbv7.o %s
 # RUN: llvm-objdump -s --section=.rodata %t_thumbv7.o | FileCheck --check-prefix=CHECK-OBJ %s
-# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \
-# RUN:              -slab-page-size 4096 %t_thumbv7.o -debug-only=jitlink 2>&1 \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -slab-address 0x76ff0000 -slab-allocate 10Kb \
+# RUN:              -slab-page-size 4096 %t_thumbv7.o 2>&1 \
 # RUN:              | FileCheck --check-prefix=CHECK-LG %s
 # RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \
 # RUN:              -slab-page-size 4096 %t_thumbv7.o -check %s
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/ELF_ehframe.s b/llvm/test/ExecutionEngine/JITLink/AArch64/ELF_ehframe.s
index 151a041e7bcd..b25ffee270c4 100644
--- a/llvm/test/ExecutionEngine/JITLink/AArch64/ELF_ehframe.s
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/ELF_ehframe.s
@@ -1,7 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=aarch64-linux-gnu -filetype=obj -o %t %s
-# RUN: llvm-jitlink -noexec -phony-externals -debug-only=jitlink %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -phony-externals %t 2>&1 | FileCheck %s
 #
 # Check that splitting of eh-frame sections works.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_compact_unwind.s b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_compact_unwind.s
index 20534d5a4865..b2adb8566e7c 100644
--- a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_compact_unwind.s
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_compact_unwind.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=arm64-apple-ios -filetype=obj -o %t %s
-# RUN: llvm-jitlink -noexec -debug-only=jitlink %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check that splitting of compact-unwind sections works.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ehframe.s b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ehframe.s
index 8d43b0f975f6..4e84518f86a0 100644
--- a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ehframe.s
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ehframe.s
@@ -1,7 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=arm64-apple-darwin11 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -noexec -phony-externals -debug-only=jitlink %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -phony-externals %t 2>&1 | FileCheck %s
 #
 # Check that splitting of eh-frame sections works.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_ehframe.s b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_ehframe.s
index cc545853f327..806cdcf392f2 100644
--- a/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_ehframe.s
+++ b/llvm/test/ExecutionEngine/JITLink/LoongArch/ELF_loongarch64_ehframe.s
@@ -1,7 +1,7 @@
 # REQUIRES: asserts
-# RUN: llvm-mc --triple=loongarch64-linux-gnu --filetype=obj -o %t %s
-# RUN: llvm-jitlink --noexec --phony-externals --debug-only=jitlink %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-mc -triple=loongarch64-linux-gnu -filetype=obj -o %t %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -phony-externals %t 2>&1 | FileCheck %s
 
 ## Check that splitting of eh-frame sections works.
 
diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s
index 480fbb861310..2b5c9e383c04 100644
--- a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s
+++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call.s
@@ -1,15 +1,15 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=riscv32 -mattr=+relax -filetype=obj -o %t.rv32 %s
-# RUN: llvm-jitlink -noexec \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s %t.rv32 \
-# RUN:    2>&1 | FileCheck %s
+# RUN:     -check %s %t.rv32 2>&1 \
+# RUN:     | FileCheck %s
 
 # RUN: llvm-mc -triple=riscv64 -mattr=+relax -filetype=obj -o %t.rv64 %s
-# RUN: llvm-jitlink -noexec \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s %t.rv64 \
-# RUN:    2>&1 | FileCheck %s
+# RUN:     -check %s %t.rv64 2>&1 \
+# RUN:     | FileCheck %s
 
         .text
 
diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s
index e8a2928999f4..3bbfd557a0a6 100644
--- a/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s
+++ b/llvm/test/ExecutionEngine/JITLink/RISCV/ELF_relax_call_rvc.s
@@ -1,43 +1,43 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=riscv32 -mattr=+relax,+c -filetype=obj -o %t.rv32 %s
-# RUN: llvm-jitlink -noexec \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s %t.rv32 \
-# RUN:    2>&1 | FileCheck %s
-# RUN: llvm-jitlink -noexec \
+# RUN:     -check %s %t.rv32 2>&1 \
+# RUN:     | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s -check-name=jitlink-check-rv32 %t.rv32 \
-# RUN:     2>&1 | FileCheck -check-prefix=CHECK-RV32 %s
+# RUN:     -check %s -check-name=jitlink-check-rv32 %t.rv32 2>&1 \
+# RUN:     | FileCheck -check-prefix=CHECK-RV32 %s
 
 # RUN: llvm-mc -triple=riscv64 -mattr=+relax,+c -filetype=obj -o %t.rv64 %s
-# RUN: llvm-jitlink -noexec \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s %t.rv64 \
-# RUN:     2>&1 | FileCheck %s
-# RUN: llvm-jitlink -noexec \
+# RUN:     -check %s %t.rv64 2>&1 \
+# RUN:     | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s -check-name=jitlink-check-rv64 %t.rv64 \
-# RUN:     2>&1 | FileCheck -check-prefix=CHECK-RV64 %s
+# RUN:     -check %s -check-name=jitlink-check-rv64 %t.rv64 2>&1 \
+# RUN:     | FileCheck -check-prefix=CHECK-RV64 %s
 
 # RUN: llvm-mc -triple=riscv32 -mattr=+relax,+zca -filetype=obj -o %t.rv32zca %s
-# RUN: llvm-jitlink -noexec \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s %t.rv32zca \
-# RUN:    2>&1 | FileCheck %s
-# RUN: llvm-jitlink -noexec \
+# RUN:     -check %s %t.rv32zca 2>&1 \
+# RUN:     | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s -check-name=jitlink-check-rv32 %t.rv32zca \
-# RUN:     2>&1 | FileCheck -check-prefix=CHECK-RV32 %s
+# RUN:     -check %s -check-name=jitlink-check-rv32 %t.rv32zca 2>&1 \
+# RUN:     | FileCheck -check-prefix=CHECK-RV32 %s
 
 # RUN: llvm-mc -triple=riscv64 -mattr=+relax,+c -filetype=obj -o %t.rv64 %s
-# RUN: llvm-jitlink -noexec \
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s %t.rv64 \
-# RUN:     2>&1 | FileCheck %s
-# RUN: llvm-jitlink -noexec \
+# RUN:     -check %s %t.rv64 2>&1 \
+# RUN:     | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
 # RUN:     -slab-allocate 100Kb -slab-address 0x1000 -slab-page-size 4096 \
-# RUN:     -debug-only=jitlink -check %s -check-name=jitlink-check-rv64 %t.rv64 \
-# RUN:     2>&1 | FileCheck -check-prefix=CHECK-RV64 %s
+# RUN:     -check %s -check-name=jitlink-check-rv64 %t.rv64 2>&1 \
+# RUN:     | FileCheck -check-prefix=CHECK-RV64 %s
 
         .text
 
diff --git a/llvm/test/ExecutionEngine/JITLink/RISCV/anonymous_symbol.s b/llvm/test/ExecutionEngine/JITLink/RISCV/anonymous_symbol.s
index e7114e4d643c..a1badfd0ee3d 100644
--- a/llvm/test/ExecutionEngine/JITLink/RISCV/anonymous_symbol.s
+++ b/llvm/test/ExecutionEngine/JITLink/RISCV/anonymous_symbol.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=riscv64 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -debug-only=jitlink -num-threads=0 -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Because of the exist of cfi directive, sections like eh_frame section will be emitted
 # in llvm's object code emission phase. Anonymous symbols will also be emitted to indicate
diff --git a/llvm/test/ExecutionEngine/JITLink/ppc64/ELF_ppc64_ehframe.s b/llvm/test/ExecutionEngine/JITLink/ppc64/ELF_ppc64_ehframe.s
index 9e9b340c5d8d..75f09ff402ad 100644
--- a/llvm/test/ExecutionEngine/JITLink/ppc64/ELF_ppc64_ehframe.s
+++ b/llvm/test/ExecutionEngine/JITLink/ppc64/ELF_ppc64_ehframe.s
@@ -1,10 +1,12 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=powerpc64le-unknown-linux-gnu -filetype=obj -o %t %s
-# RUN: llvm-jitlink -noexec -phony-externals -debug-only=jitlink %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec -phony-externals \
+# RUN:              %t 2>&1 \
+# RUN:              | FileCheck %s
 # RUN: llvm-mc -triple=powerpc64-unknown-linux-gnu -filetype=obj -o %t %s
-# RUN: llvm-jitlink -noexec -phony-externals -debug-only=jitlink %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec -phony-externals \
+# RUN:              %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check that splitting of eh-frame sections works.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/ppc64/external_weak.s b/llvm/test/ExecutionEngine/JITLink/ppc64/external_weak.s
index 0bc90903caf9..7021a27294c9 100644
--- a/llvm/test/ExecutionEngine/JITLink/ppc64/external_weak.s
+++ b/llvm/test/ExecutionEngine/JITLink/ppc64/external_weak.s
@@ -4,8 +4,9 @@
 # RUN:   %t/external_weak.o %S/Inputs/external_weak.s
 # RUN: llvm-mc -triple=powerpc64le-unknown-linux-gnu -filetype=obj -o \
 # RUN:   %t/external_weak_main.o %S/Inputs/external_weak_main.s
-# RUN: llvm-jitlink -noexec -debug-only=jitlink %t/external_weak.o \
-# RUN:   %t/external_weak_main.o 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              %t/external_weak.o %t/external_weak_main.o 2>&1 \
+# RUN:              | FileCheck %s
 # CHECK: Created ELFLinkGraphBuilder for "{{.*}}external_weak_main.o"
 # CHECK: Creating defined graph symbol for ELF symbol "foo"
 # CHECK: External symbols:
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_abs.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_abs.s
index 830a2e00f487..d69dbbdd7040 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_abs.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_abs.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t
-# RUN: llvm-jitlink --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check absolute symbol is created with a correct value.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_any.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_any.test
index 10f118280b1d..b1176744b9a4 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_any.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_any.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check a weak symbol is created for a COMDAT symbol with IMAGE_COMDAT_SELECT_ANY selection type.
 #
 # CHECK: Creating graph symbols...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_associative.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_associative.test
index 7dfb4c7ec8ea..8915d04d7cbf 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_associative.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_associative.test
@@ -1,16 +1,19 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check COMDAT associative symbol is emitted as local symbol.
 #
-# CHECK: Creating graph symbols...
-# CHECK:      2: Creating defined graph symbol for COFF symbol ".text" in .text (index: 2)
-# CHECK-NEXT:   0x0 (block + 0x00000000): size: 0x00000001, linkage: strong, scope: local, dead  -   <anonymous symbol>
-# CHECK-NEXT: 4: Exporting COMDAT graph symbol for COFF symbol "func" in section 2
-# CHECK-NEXT:   0x0 (block + 0x00000000): size: 0x00000001, linkage: weak, scope: default, dead  -   func
-# CHECK-NEXT: 5: Creating defined graph symbol for COFF symbol ".xdata" in .xdata (index: 3)
-# CHECK-NEXT:   0x0 (block + 0x00000000): size: 0x00000000, linkage: strong, scope: local, dead  -   .xdata
+# CHECK:       Creating graph symbols...
+# CHECK:         0: Creating defined graph symbol for COFF symbol ".text" in .text (index: 1)
+# CHECK-NEXT:      0x0 (block + 0x00000000): size: 0x00000000, linkage: strong, scope: local, dead  -   .text
+# CHECK-NEXT:    4: Exporting COMDAT graph symbol for COFF symbol "func" in section 2
+# CHECK-NEXT:      0x0 (block + 0x00000000): size: 0x00000000, linkage: weak, scope: default, dead  -   func
+# CHECK-NEXT:    4: Creating defined graph symbol for COFF symbol "func" in .text (index: 2)
+# CHECK-NEXT:      0x0 (block + 0x00000000): size: 0x00000000, linkage: weak, scope: default, dead  -   func
+# CHECK-NEXT:    5: Creating defined graph symbol for COFF symbol ".xdata" in .xdata (index: 3)
+# CHECK-NEXT:      0x0 (block + 0x00000000): size: 0x00000000, linkage: strong, scope: local, dead  -   .xdata
 
 --- !COFF
 header:
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_exact_match.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_exact_match.test
index f7572714bae1..76a0ac4813b8 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_exact_match.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_exact_match.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check a weak symbol is created for a COMDAT symbol with IMAGE_COMDAT_SELECT_EXACT_MATCH selection type.
 # Doesn't check the content validation.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_intervene.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_intervene.test
index 11a182578379..79f4b15bd53b 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_intervene.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_intervene.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check a comdat export is done correctly even if second symbol of comdat sequences appear out of order
 #
 # CHECK: Creating graph symbols...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_largest.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_largest.test
index 86d809d63ed2..dc0529780f6a 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_largest.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_largest.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check jitlink handles largest selection type as plain weak symbol.
 #
 # CHECK: Creating graph symbols...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_noduplicate.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_noduplicate.test
index 53b2c81b5ec7..0c5313eb7d61 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_noduplicate.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_noduplicate.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check a strong symbol is created for a COMDAT symbol with IMAGE_COMDAT_SELECT_NODUPLICATES selection type.
 #
 # CHECK: Creating graph symbols...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_offset.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_offset.test
index 97467fdb5ee9..6cd8ff9b1e5a 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_offset.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_offset.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check a COMDAT symbol with an offset is handled correctly.
 #
 # CHECK: Creating graph symbols...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_same_size.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_same_size.test
index ef0f84a584c3..e1d955f54ffb 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_same_size.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_same_size.test
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check a weak symbol is created for a COMDAT symbol with IMAGE_COMDAT_SELECT_SAME_SIZE selection type.
 # Doesn't check the size validation.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak.s
index 79ac75ffe441..8fa8ba0d8c95 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_comdat_weak.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t
-# RUN: llvm-jitlink --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check a COMDAT any symbol is exported as a weak symbol.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_common_symbol.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_common_symbol.s
index 2d4ad30f94d8..2788a9b0ae14 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_common_symbol.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_common_symbol.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t
-# RUN: llvm-jitlink --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check a common symbol is created.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_duplicate_externals.test b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_duplicate_externals.test
index e929c0131686..ebce7958442c 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_duplicate_externals.test
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_duplicate_externals.test
@@ -1,10 +1,10 @@
 # REQUIRES: asserts
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec -abs __ImageBase=0xfff00000 \
-# RUN: --debug-only=jitlink \
-# RUN: -slab-allocate 100Kb -slab-address 0xfff00000 -slab-page-size 4096 \
-# RUN: %t 2>&1 | FileCheck %s
-# 
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -abs __ImageBase=0xfff00000 -slab-allocate 100Kb \
+# RUN:              -slab-address 0xfff00000 -slab-page-size 4096 %t 2>&1 \
+# RUN:              | FileCheck %s
+#
 # Check duplicate undefined external symbols are handled correctly.
 #
 # CHECK: Creating graph symbols...
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_file_debug.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_file_debug.s
index 3980f81801ac..ac1ef2dc962b 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_file_debug.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_file_debug.s
@@ -1,6 +1,8 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t
-# RUN: llvm-jitlink -abs func=0xcafef00d --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -abs func=0xcafef00d %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check a file debug symbol is skipped.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_static_var.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_static_var.s
index 5275c7d81766..dce0c1eaa77c 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_static_var.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_static_var.s
@@ -1,6 +1,8 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t
-# RUN: llvm-jitlink -abs var=0xcafef00d --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -abs var=0xcafef00d  %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check a local symbol is created for a static variable.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_weak_external.s b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_weak_external.s
index c750d75b82f2..d49d56150017 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_weak_external.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/COFF_weak_external.s
@@ -1,6 +1,8 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t
-# RUN: llvm-jitlink -abs var=0xcafef00d --debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -abs var=0xcafef00d %t 2>&1 | \
+# RUN:              FileCheck %s
 #
 # Check a default symbol is aliased as a weak external symbol.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_debug_section_lifetime_is_NoAlloc.yaml b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_debug_section_lifetime_is_NoAlloc.yaml
index 0afcda467c32..09dda4739225 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_debug_section_lifetime_is_NoAlloc.yaml
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_debug_section_lifetime_is_NoAlloc.yaml
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: yaml2obj -o %t.o %s
-# RUN: llvm-jitlink -debug-only=jitlink -noexec %t.o 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t.o 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check that debug sections get NoAlloc lifetimes.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_basic.s b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_basic.s
index c01ced5d0523..9339f076c67f 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_basic.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_basic.s
@@ -2,8 +2,9 @@
 # UNSUPPORTED: system-windows
 # RUN: llvm-mc -triple=x86_64-unknown-linux -position-independent \
 # RUN:     -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=jitlink -abs bar=0x01 \
-# RUN:     -abs _ZTIi=0x02 -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -abs bar=0x01 -abs _ZTIi=0x02 %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # FIXME: This test should run on windows. Investigate spurious
 # 'note: command had no output on stdout or stderr' errors, then re-enable.
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_large_static_personality_encodings.s b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_large_static_personality_encodings.s
index 64990b5d38f0..98fc5f4e3acc 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_large_static_personality_encodings.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_ehframe_large_static_personality_encodings.s
@@ -2,8 +2,9 @@
 # UNSUPPORTED: system-windows
 # RUN: llvm-mc -triple=x86_64-pc-linux-gnu -large-code-model \
 # RUN:   -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=jitlink -noexec -phony-externals %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -phony-externals %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check handling of pointer encodings for personality functions when compiling
 # with `-mcmodel=large -static`.
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s b/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s
index 0898ad8b1823..83d71cdf6fc8 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=orc -num-threads=0 -noexec \
-# RUN:     -abs _external_func=0x1 -entry=_foo %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=orc -noexec \
+# RUN:              -abs _external_func=0x1 -entry=_foo %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check that simplification eliminates dependencies on symbols in this unit,
 # and correctly propagates dependencies on symbols outside the unit (including
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s
index df44ce996eca..81ea18f24934 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO-check-dwarf-filename.s
@@ -1,6 +1,7 @@
 # RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -filetype=obj -o %t.o %s
-# RUN: llvm-jitlink -debug-only=orc -noexec -debugger-support %t.o 2>&1 | \
-# RUN:     FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=orc -noexec -debugger-support \
+# RUN:              %t.o 2>&1 \
+# RUN:              | FileCheck %s
 #
 # REQUIRES: asserts && system-darwin
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_compact_unwind.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_compact_unwind.s
index e57831418304..38522078f4ce 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_compact_unwind.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_compact_unwind.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=x86_64-apple-darwin11 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -noexec -debug-only=jitlink %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Check that splitting of compact-unwind sections works.
 #
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_alignment.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_alignment.s
index 5a8cef5749f0..3859a35ce221 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_alignment.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_alignment.s
@@ -1,6 +1,7 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=x86_64-apple-macos10.9 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Verify that PC-begin candidate symbols have been sorted correctly when adding
 # PC-begin edges for FDEs. In this test both _main and _X are at address zero,
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_splitting.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_splitting.s
index a5baf5611cc2..0d68a10c59d1 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_splitting.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_cstring_section_splitting.s
@@ -1,7 +1,8 @@
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=jitlink -noexec -entry hook %t 2>&1 | \
-# RUN:   FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec \
+# RUN:              -entry hook %t 2>&1 \
+# RUN:              | FileCheck %s
 #
 # Verify that we split C string literals on null-terminators, rather than on
 # symbol boundaries. We expect four dead-stripped symbols: l_str.0, l_str.2,
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_non_subsections_via_symbols.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_non_subsections_via_symbols.s
index e1adb3bc75a1..66fcb47fc7ab 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_non_subsections_via_symbols.s
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_non_subsections_via_symbols.s
@@ -4,7 +4,8 @@
 #
 # REQUIRES: asserts
 # RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -filetype=obj -o %t %s
-# RUN: llvm-jitlink -debug-only=jitlink -noexec %t 2>&1 | FileCheck %s
+# RUN: llvm-jitlink -num-threads=0 -debug-only=jitlink -noexec %t 2>&1 \
+# RUN:              | FileCheck %s
 
 # CHECK:        Creating graph symbols...
 # CHECK:          Graphifying regular section __DATA,__data...
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/zero-ptr.ll b/llvm/test/Instrumentation/HWAddressSanitizer/zero-ptr.ll
new file mode 100644
index 000000000000..a201174df995
--- /dev/null
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/zero-ptr.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=hwasan -S | FileCheck %s
+; RUN: opt < %s -passes=hwasan -hwasan-recover=0 -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=ABORT-ZERO-BASED-SHADOW
+
+; This shows that HWASan will emit a memaccess check when dereferencing a null
+; pointer.
+; The output is used as the source for llvm/test/CodeGen/AArch64/hwasan-zero-ptr.ll.
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android10000"
+
+define void @test_store_to_zeroptr() sanitize_hwaddress {
+; CHECK-LABEL: define void @test_store_to_zeroptr
+; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr @__hwasan_shadow)
+; CHECK-NEXT:    [[B:%.*]] = inttoptr i64 0 to ptr
+; CHECK-NEXT:    call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[DOTHWASAN_SHADOW]], ptr [[B]], i32 19)
+; CHECK-NEXT:    store i64 42, ptr [[B]], align 8
+; CHECK-NEXT:    ret void
+;
+; ABORT-ZERO-BASED-SHADOW-LABEL: define void @test_store_to_zeroptr
+; ABORT-ZERO-BASED-SHADOW-SAME: () #[[ATTR0:[0-9]+]] {
+; ABORT-ZERO-BASED-SHADOW-NEXT:  entry:
+; ABORT-ZERO-BASED-SHADOW-NEXT:    [[DOTHWASAN_SHADOW:%.*]] = call ptr asm "", "=r,0"(ptr null)
+; ABORT-ZERO-BASED-SHADOW-NEXT:    [[B:%.*]] = inttoptr i64 0 to ptr
+; ABORT-ZERO-BASED-SHADOW-NEXT:    call void @llvm.hwasan.check.memaccess.shortgranules.fixedshadow(ptr [[B]], i32 19, i64 0)
+; ABORT-ZERO-BASED-SHADOW-NEXT:    store i64 42, ptr [[B]], align 8
+; ABORT-ZERO-BASED-SHADOW-NEXT:    ret void
+;
+entry:
+  %b = inttoptr i64 0 to i64*
+  store i64 42, ptr %b
+  ret void
+}
diff --git a/llvm/test/Instrumentation/TypeSanitizer/access-with-offset.ll b/llvm/test/Instrumentation/TypeSanitizer/access-with-offset.ll
index 78f3816c9aef..56cf3f528f83 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/access-with-offset.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/access-with-offset.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 ;.
 ; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }]
diff --git a/llvm/test/Instrumentation/TypeSanitizer/alloca-only.ll b/llvm/test/Instrumentation/TypeSanitizer/alloca-only.ll
index 1aa47cacc127..117cd1a3d410 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/alloca-only.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/alloca-only.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/alloca.ll b/llvm/test/Instrumentation/TypeSanitizer/alloca.ll
index 94098bd8a173..ea5adf6e4340 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/alloca.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/alloca.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/anon.ll b/llvm/test/Instrumentation/TypeSanitizer/anon.ll
index ce4f0c1be0a4..37de1b71e0c7 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/anon.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/anon.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll b/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll
index 9b9522f3dba1..8ddc5738a673 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs
 ; Test basic type sanitizer instrumentation.
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic.ll b/llvm/test/Instrumentation/TypeSanitizer/basic.ll
index 8873a40798b1..704c18800f19 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/basic.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/basic.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/byval.ll b/llvm/test/Instrumentation/TypeSanitizer/byval.ll
index 23ed1b00173b..6ae343d8d534 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/byval.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/byval.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs
 ; Test basic type sanitizer instrumentation.
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/globals.ll b/llvm/test/Instrumentation/TypeSanitizer/globals.ll
index 1f57c2a3816d..a73599e86448 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/globals.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/globals.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/invalid-metadata.ll b/llvm/test/Instrumentation/TypeSanitizer/invalid-metadata.ll
index e7de62e12d35..0c99c0f2e674 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/invalid-metadata.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/invalid-metadata.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 !llvm.tysan.globals = !{!0}
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/memintrinsics.ll b/llvm/test/Instrumentation/TypeSanitizer/memintrinsics.ll
index 26f7c186748c..65a30bd1ace4 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/memintrinsics.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/memintrinsics.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll b/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll
index 7b07a42379b3..c7c153e140fc 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/sanitize-no-tbaa.ll b/llvm/test/Instrumentation/TypeSanitizer/sanitize-no-tbaa.ll
index 3cb7b8365866..060f031bf2c5 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/sanitize-no-tbaa.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/sanitize-no-tbaa.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Instrumentation/TypeSanitizer/swifterror.ll b/llvm/test/Instrumentation/TypeSanitizer/swifterror.ll
index 5711fb4b839f..dc83a020bc1a 100644
--- a/llvm/test/Instrumentation/TypeSanitizer/swifterror.ll
+++ b/llvm/test/Instrumentation/TypeSanitizer/swifterror.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; Test basic type sanitizer instrumentation.
 ;
-; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+; RUN: opt -passes='tysan' -S %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/MC/AArch64/armv9.6a-rme-gpc3.s b/llvm/test/MC/AArch64/armv9.6a-rme-gpc3.s
index baf05f10b9a1..093101b6cd81 100644
--- a/llvm/test/MC/AArch64/armv9.6a-rme-gpc3.s
+++ b/llvm/test/MC/AArch64/armv9.6a-rme-gpc3.s
@@ -2,10 +2,18 @@
 // RUN: llvm-mc -triple aarch64 -show-encoding %s  | FileCheck %s
 .func:
   apas x0
+  apas x1
+  apas x2
+  apas x17
+  apas x30
   mrs x3, GPCBW_EL3
   msr GPCBW_EL3, x4
 
 # CHECK:      .func:
-# CHECK-NEXT: 	apas	x0                              // encoding: [0x1f,0x70,0x0e,0xd5]
+# CHECK-NEXT:	apas    x0                              // encoding: [0x00,0x70,0x0e,0xd5]
+# CHECK-NEXT:	apas    x1                              // encoding: [0x01,0x70,0x0e,0xd5]
+# CHECK-NEXT:	apas    x2                              // encoding: [0x02,0x70,0x0e,0xd5]
+# CHECK-NEXT:	apas    x17                             // encoding: [0x11,0x70,0x0e,0xd5]
+# CHECK-NEXT:	apas    x30                             // encoding: [0x1e,0x70,0x0e,0xd5]
 # CHECK-NEXT: 	mrs	x3, GPCBW_EL3                   // encoding: [0xa3,0x21,0x3e,0xd5]
 # CHECK-NEXT: 	msr	GPCBW_EL3, x4                   // encoding: [0xa4,0x21,0x1e,0xd5]
diff --git a/llvm/test/MC/AMDGPU/gfx1030_err.s b/llvm/test/MC/AMDGPU/gfx1030_err.s
index 87a09875f75e..a0565dc1e6d3 100644
--- a/llvm/test/MC/AMDGPU/gfx1030_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1030_err.s
@@ -573,3 +573,9 @@ v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
 
 v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
 // GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+image_bvh_intersect_ray v[4:7], v[9:19], null
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_bvh64_intersect_ray v[4:7], v[9:20], null
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s b/llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s
index 670e97325355..74c283c571c5 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_smem_err.s
@@ -84,3 +84,5 @@ s_buffer_load_dwordx16 s[4:19], null, s101
 s_buffer_store_dword s4, null, s101
 // NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
+s_atc_probe_buffer 7, null, s2
+// NOGFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s
index 9c614453c1eb..25861989ec32 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_mimg_err.s
@@ -517,3 +517,9 @@ image_sample_o v[5:6], v[1:2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
 
 image_sample_o v[5:6], v[1:2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
 // NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_bvh_intersect_ray v[4:7], v[9:19], null
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_bvh64_intersect_ray v[4:7], v[9:20], null
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s
index da195b4a4118..7dd6ded66f1d 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_smem_err.s
@@ -29,3 +29,6 @@ s_buffer_load_dwordx8 s[4:11], null, s101
 
 s_buffer_load_dwordx16 s[4:19], null, s101
 // NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_atc_probe_buffer 7, null, s2
+// NOGFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
index fc8d2bdc0540..6bc92bc29ea8 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
@@ -3746,50 +3746,62 @@ v_max_u16 v5.l, v255.l, v255.h
 v_max_u16 v255.h, 0xfe0b, vcc_hi
 // GFX11: v_max_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x09,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_f16 v5, v1, v2, s3
-// GFX11: v_maxmin_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
+v_maxmin_f16 v5.l, v1.l, v2.l, s3
+// GFX11: v_maxmin_f16 v5.l, v1.l, v2.l, s3       ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
 
-v_maxmin_f16 v5, v255, s2, s105
-// GFX11: v_maxmin_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
+v_maxmin_f16 v5.l, v255.l, s2, s105
+// GFX11: v_maxmin_f16 v5.l, v255.l, s2, s105     ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
 
-v_maxmin_f16 v5, s1, v255, exec_hi
-// GFX11: v_maxmin_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
+v_maxmin_f16 v5.l, s1, v255.l, exec_hi
+// GFX11: v_maxmin_f16 v5.l, s1, v255.l, exec_hi  ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
 
-v_maxmin_f16 v5, s105, s105, exec_lo
-// GFX11: v_maxmin_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
+v_maxmin_f16 v5.l, s105, s105, exec_lo
+// GFX11: v_maxmin_f16 v5.l, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
 
-v_maxmin_f16 v5, vcc_lo, ttmp15, v3
-// GFX11: v_maxmin_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
+v_maxmin_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX11: v_maxmin_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX11: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX11: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX11: v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
+v_maxmin_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX11: v_maxmin_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_maxmin_f16 v5, m0, 0.5, m0
-// GFX11: v_maxmin_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
+v_maxmin_f16 v5.l, m0, 0.5, m0
+// GFX11: v_maxmin_f16 v5.l, m0, 0.5, m0          ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX11: v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
+v_maxmin_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX11: v_maxmin_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
 
-v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo|
-// GFX11: v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_maxmin_f16 v5.l, -|exec_hi|, null, -|vcc_lo|
+// GFX11: v_maxmin_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_maxmin_f16 v5, null, exec_lo, -|0xfe0b|
-// GFX11: v_maxmin_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_maxmin_f16 v5.l, null, exec_lo, -|0xfe0b|
+// GFX11: v_maxmin_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc|
-// GFX11: v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_maxmin_f16 v5.l, -1, -|exec_hi|, -|src_scc|
+// GFX11: v_maxmin_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2
-// GFX11: v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
+v_maxmin_f16 v5.l, 0.5, -m0, 0.5 mul:2
+// GFX11: v_maxmin_f16 v5.l, 0.5, -m0, 0.5 mul:2  ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
 
-v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4
-// GFX11: v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
+v_maxmin_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4
+// GFX11: v_maxmin_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
 
-v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2
-// GFX11: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+v_maxmin_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX11: v_maxmin_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+v_maxmin_f16 v5.l, v255.h, s2, s105
+// GFX11: v_maxmin_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01]
+
+v_maxmin_f16 v5.l, s1, v255.h, exec_hi
+// GFX11: v_maxmin_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01]
+
+v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX11: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_maxmin_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX11: v_maxmin_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 v_maxmin_f32 v5, v1, v2, s3
 // GFX11: v_maxmin_f32 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x5e,0xd6,0x01,0x05,0x0e,0x00]
@@ -4823,50 +4835,62 @@ v_min_u16 v5.l, v255.l, v255.h
 v_min_u16 v255.h, 0xfe0b, vcc_hi
 // GFX11: v_min_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x0b,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_minmax_f16 v5, v1, v2, s3
-// GFX11: v_minmax_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
+v_minmax_f16 v5.l, v1.l, v2.l, s3
+// GFX11: v_minmax_f16 v5.l, v1.l, v2.l, s3       ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
+
+v_minmax_f16 v5.l, v255.l, s2, s105
+// GFX11: v_minmax_f16 v5.l, v255.l, s2, s105     ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
+
+v_minmax_f16 v5.l, s1, v255.l, exec_hi
+// GFX11: v_minmax_f16 v5.l, s1, v255.l, exec_hi  ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
+
+v_minmax_f16 v5.l, s105, s105, exec_lo
+// GFX11: v_minmax_f16 v5.l, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
+
+v_minmax_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX11: v_minmax_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_minmax_f16 v5, v255, s2, s105
-// GFX11: v_minmax_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
+v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX11: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_minmax_f16 v5, s1, v255, exec_hi
-// GFX11: v_minmax_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
+v_minmax_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX11: v_minmax_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_minmax_f16 v5, s105, s105, exec_lo
-// GFX11: v_minmax_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
+v_minmax_f16 v5.l, m0, 0.5, m0
+// GFX11: v_minmax_f16 v5.l, m0, 0.5, m0          ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_minmax_f16 v5, vcc_lo, ttmp15, v3
-// GFX11: v_minmax_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
+v_minmax_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX11: v_minmax_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
 
-v_minmax_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX11: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_minmax_f16 v5.l, -|exec_hi|, null, -|vcc_lo|
+// GFX11: v_minmax_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX11: v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
+v_minmax_f16 v5.l, null, exec_lo, -|0xfe0b|
+// GFX11: v_minmax_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_minmax_f16 v5, m0, 0.5, m0
-// GFX11: v_minmax_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
+v_minmax_f16 v5.l, -1, -|exec_hi|, -|src_scc|
+// GFX11: v_minmax_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_minmax_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX11: v_minmax_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
+v_minmax_f16 v5.l, 0.5, -m0, 0.5 mul:2
+// GFX11: v_minmax_f16 v5.l, 0.5, -m0, 0.5 mul:2  ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
 
-v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo|
-// GFX11: v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_minmax_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4
+// GFX11: v_minmax_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
 
-v_minmax_f16 v5, null, exec_lo, -|0xfe0b|
-// GFX11: v_minmax_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_minmax_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX11: v_minmax_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
-v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc|
-// GFX11: v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_minmax_f16 v5.l, v255.h, s2, s105
+// GFX11: v_minmax_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01]
 
-v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2
-// GFX11: v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
+v_minmax_f16 v5.l, s1, v255.h, exec_hi
+// GFX11: v_minmax_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01]
 
-v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4
-// GFX11: v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
+v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX11: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2
-// GFX11: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+v_minmax_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX11: v_minmax_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 v_minmax_f32 v5, v1, v2, s3
 // GFX11: v_minmax_f32 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x5f,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
index f71569433d32..5fa1334aa6e9 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
@@ -2660,47 +2660,92 @@ v_max_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_max_u16_e64_dpp v255.h, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_max_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0x09,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX11: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX11: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
 
-v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
+v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
 
-v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
+v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_maxmin_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x50,0x01,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
+
+v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
+
+v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
 v_maxmin_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX11: v_maxmin_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -3704,47 +3749,92 @@ v_min_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_min_u16_e64_dpp v255.h, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_min_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0x0b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
+
+v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_minmax_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x50,0x01,0xff]
 
-v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX11: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
 
-v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
 
-v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX11: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
-v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
 
-v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
 
-v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
+v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
 v_minmax_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX11: v_minmax_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s
index 2ececc0c78ec..2fc02061c59d 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s
@@ -1660,41 +1660,80 @@ v_max_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_max_u16_e64_dpp v255.h, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_max_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0x09,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x60,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x60,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
 
-v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x60,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x60,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0xfe,0xa1,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xfa,0xc1,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x03,0x60,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x60,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x60,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 v_maxmin_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_maxmin_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -2434,41 +2473,80 @@ v_min_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_min_u16_e64_dpp v255.h, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_min_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0x0b,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x61,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x61,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0xfe,0xa1,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xfa,0xc1,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x03,0x61,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x61,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x61,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
 
-v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x61,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x61,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 v_minmax_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_minmax_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s
index 0f2cfc39e2ec..ee82fa302d49 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_mimg_err.s
@@ -374,3 +374,8 @@ image_sample_o v[5:6], [v1, v2], null, s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D
 image_sample_o v[5:6], [v1, v2], s[8:15], null dmask:0x3 dim:SQ_RSRC_IMG_1D
 // NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
+image_bvh_intersect_ray v[4:7], [v9, v10, v[11:13], v[14:16], v[17:19]], null
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+image_bvh64_intersect_ray v[4:7], [v[9:10], v11, v[12:14], v[15:17], v[18:20]], null
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s
index 0f62c8b93999..49d7c7245608 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_smem_err.s
@@ -29,3 +29,21 @@ s_buffer_load_dwordx8 s[4:11], null, s101
 
 s_buffer_load_dwordx16 s[4:19], null, s101
 // NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_atc_probe_buffer 7, null, s2
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_prefetch_data null, 100, s10, 7
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_i8 s5, null, s0
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_u8 s5, null, s0
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_i16 s5, null, s0
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_buffer_load_u16 s5, null, s0
+// NOGFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
index c2db5b90bb47..3e7b7d28c2e9 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
@@ -3590,50 +3590,62 @@ v_max_u16 v255.l, 0xfe0b, vcc_hi
 v_max_u16 v255.h, 0xfe0b, vcc_hi
 // GFX12: v_max_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x09,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_num_f16 v5, v1, v2, s3
-// GFX12: v_maxmin_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
+v_maxmin_num_f16 v5.l, v1.l, v2.l, s3
+// GFX12: v_maxmin_num_f16 v5.l, v1.l, v2.l, s3   ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
 
-v_maxmin_num_f16 v5, v255, s2, s105
-// GFX12: v_maxmin_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+v_maxmin_num_f16 v5.l, v255.l, s2, s105
+// GFX12: v_maxmin_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
 
-v_maxmin_num_f16 v5, s1, v255, exec_hi
-// GFX12: v_maxmin_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+v_maxmin_num_f16 v5.l, s1, v255.l, exec_hi
+// GFX12: v_maxmin_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
 
-v_maxmin_num_f16 v5, s105, s105, exec_lo
-// GFX12: v_maxmin_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
+v_maxmin_num_f16 v5.l, s105, s105, exec_lo
+// GFX12: v_maxmin_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
 
-v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3
-// GFX12: v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
+v_maxmin_num_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX12: v_maxmin_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX12: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX12: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX12: v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
+v_maxmin_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX12: v_maxmin_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_maxmin_num_f16 v5, m0, 0.5, m0
-// GFX12: v_maxmin_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
+v_maxmin_num_f16 v5.l, m0, 0.5, m0
+// GFX12: v_maxmin_num_f16 v5.l, m0, 0.5, m0      ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX12: v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
+v_maxmin_num_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX12: v_maxmin_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
 
-v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo|
-// GFX12: v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_maxmin_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo|
+// GFX12: v_maxmin_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b|
-// GFX12: v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_maxmin_num_f16 v5.l, null, exec_lo, -|0xfe0b|
+// GFX12: v_maxmin_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc|
-// GFX12: v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_maxmin_num_f16 v5.l, -1, -|exec_hi|, -|src_scc|
+// GFX12: v_maxmin_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2
-// GFX12: v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
+v_maxmin_num_f16 v5.l, 0.5, -m0, 0.5 mul:2
+// GFX12: v_maxmin_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
 
-v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4
-// GFX12: v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
+v_maxmin_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4
+// GFX12: v_maxmin_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
 
-v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2
-// GFX12: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+v_maxmin_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX12: v_maxmin_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+v_maxmin_num_f16 v5.l, v255.h, s2, s105
+// GFX12: v_maxmin_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+
+v_maxmin_num_f16 v5.l, s1, v255.h, exec_hi
+// GFX12: v_maxmin_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+
+v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX12: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_maxmin_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX12: v_maxmin_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 v_maxmin_num_f32 v5, v1, v2, s3
 // GFX12: v_maxmin_num_f32 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x69,0xd6,0x01,0x05,0x0e,0x00]
@@ -4580,50 +4592,62 @@ v_min_u16 v255.l, 0xfe0b, vcc_hi
 v_min_u16 v255.h, 0xfe0b, vcc_hi
 // GFX12: v_min_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] ; encoding: [0xff,0x40,0x0b,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
-v_minmax_num_f16 v5, v1, v2, s3
-// GFX12: v_minmax_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
+v_minmax_num_f16 v5.l, v1.l, v2.l, s3
+// GFX12: v_minmax_num_f16 v5.l, v1.l, v2.l, s3   ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
+
+v_minmax_num_f16 v5.l, v255.l, s2, s105
+// GFX12: v_minmax_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+
+v_minmax_num_f16 v5.l, s1, v255.l, exec_hi
+// GFX12: v_minmax_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+
+v_minmax_num_f16 v5.l, s105, s105, exec_lo
+// GFX12: v_minmax_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
+
+v_minmax_num_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX12: v_minmax_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_minmax_num_f16 v5, v255, s2, s105
-// GFX12: v_minmax_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX12: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_minmax_num_f16 v5, s1, v255, exec_hi
-// GFX12: v_minmax_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+v_minmax_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX12: v_minmax_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_minmax_num_f16 v5, s105, s105, exec_lo
-// GFX12: v_minmax_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
+v_minmax_num_f16 v5.l, m0, 0.5, m0
+// GFX12: v_minmax_num_f16 v5.l, m0, 0.5, m0      ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_minmax_num_f16 v5, vcc_lo, ttmp15, v3
-// GFX12: v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
+v_minmax_num_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX12: v_minmax_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
 
-v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX12: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_minmax_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo|
+// GFX12: v_minmax_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX12: v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
+v_minmax_num_f16 v5.l, null, exec_lo, -|0xfe0b|
+// GFX12: v_minmax_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_minmax_num_f16 v5, m0, 0.5, m0
-// GFX12: v_minmax_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
+v_minmax_num_f16 v5.l, -1, -|exec_hi|, -|src_scc|
+// GFX12: v_minmax_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX12: v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
+v_minmax_num_f16 v5.l, 0.5, -m0, 0.5 mul:2
+// GFX12: v_minmax_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
 
-v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo|
-// GFX12: v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_minmax_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4
+// GFX12: v_minmax_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
 
-v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b|
-// GFX12: v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_minmax_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX12: v_minmax_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
-v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc|
-// GFX12: v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_minmax_num_f16 v5.l, v255.h, s2, s105
+// GFX12: v_minmax_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01]
 
-v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2
-// GFX12: v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
+v_minmax_num_f16 v5.l, s1, v255.h, exec_hi
+// GFX12: v_minmax_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01]
 
-v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4
-// GFX12: v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
+v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX12: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2
-// GFX12: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+v_minmax_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX12: v_minmax_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 v_minmax_num_f32 v5, v1, v2, s3
 // GFX12: v_minmax_num_f32 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x68,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s
index ee4561fad367..ffcf65187747 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s
@@ -24,11 +24,11 @@ v_minmax_f32_e64_dpp v0, -v1, -v2, -v3 dpp8:[0,1,2,3,4,5,6,7]
 v_maxmin_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[0,1,2,3,4,5,6,7]
 // GFX12: v_maxmin_num_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x00,0x80,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0xc6,0xfa]
 
-v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 v_mad_i64_i32 v[5:6], s12, v1, v2, v[3:4]
 // GFX12: v_mad_co_i64_i32 v[5:6], s12, v1, v2, v[3:4] ; encoding: [0x05,0x0c,0xff,0xd6,0x01,0x05,0x0e,0x04]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
index 623e66885aae..aa804cc302bf 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
@@ -2921,53 +2921,98 @@ v_max_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_max_u16_e64_dpp v255.h, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_max_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0x09,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX12: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX12: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
-v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
 
-v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
 
-v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
+v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x50,0x01,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
+
+v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
+
+v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
 v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -3956,53 +4001,98 @@ v_min_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_
 v_min_u16_e64_dpp v255.h, v255.l, v255.l row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_min_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x40,0x0b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, s2, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
+
+v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
-v_minmax_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x41,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xee,0x01,0x01,0x01,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xf6,0x81,0x01,0x1f,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0xfe,0xa1,0x01,0x21,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xfa,0xc1,0x01,0x2f,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x50,0x01,0xff]
 
-v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX12: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
 
-v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
 
-v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15
-// GFX12: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h quad_perm:[3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h quad_perm:[0,1,2,3]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
 
-v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
 
-v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x09,0x13]
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x09,0x13]
 
-v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
+v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30]
 
 v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
index 056ea80d8a99..e93a65ec92e7 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
@@ -1878,47 +1878,86 @@ v_max_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_max_u16_e64_dpp v255.h, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_max_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0x09,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6b,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
 
-v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
-// GFX12: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6b,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6b,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0xfe,0xa1,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xfa,0xc1,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x03,0x6b,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x6b,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x6b,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -2673,47 +2712,86 @@ v_min_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_min_u16_e64_dpp v255.h, v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_min_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x40,0x0b,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, s2, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, 2.0, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
+// GFX12: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6a,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
-v_minmax_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xee,0x01,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, -|m0| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xf6,0x81,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|exec_hi| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0xfe,0xa1,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xfa,0xc1,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.l, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x03,0x6a,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x06,0x6a,0xd6,0xea,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x13,0x6a,0xd6,0xea,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
 
-v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
-// GFX12: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x87,0x6a,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0xc7,0x6a,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.6a-rme-gpc3.txt b/llvm/test/MC/Disassembler/AArch64/armv9.6a-rme-gpc3.txt
index c5d074bf0394..d198771c341b 100644
--- a/llvm/test/MC/Disassembler/AArch64/armv9.6a-rme-gpc3.txt
+++ b/llvm/test/MC/Disassembler/AArch64/armv9.6a-rme-gpc3.txt
@@ -1,10 +1,18 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
 # RUN: llvm-mc -triple aarch64 -disassemble %s  | FileCheck %s
 
-[0x1f,0x70,0x0e,0xd5]
+[0x00,0x70,0x0e,0xd5]
+[0x01,0x70,0x0e,0xd5]
+[0x02,0x70,0x0e,0xd5]
+[0x11,0x70,0x0e,0xd5]
+[0x1e,0x70,0x0e,0xd5]
 [0xa3,0x21,0x3e,0xd5]
 [0xa4,0x21,0x1e,0xd5]
 
-# CHECK:      	sys	#6, c7, c0, #0
+# CHECK:      	sys	#6, c7, c0, #0, x0
+# CHECK-NEXT: 	sys	#6, c7, c0, #0, x1
+# CHECK-NEXT: 	sys	#6, c7, c0, #0, x2
+# CHECK-NEXT: 	sys	#6, c7, c0, #0, x17
+# CHECK-NEXT: 	sys	#6, c7, c0, #0, x30
 # CHECK-NEXT: 	mrs	x3, GPCBW_EL3
 # CHECK-NEXT: 	msr	GPCBW_EL3, x4
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
index f9e236977c97..adcca5877610 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
@@ -4361,49 +4361,118 @@
 # W64-FAKE16: v_max_u16 v255, 0xfe0b, vcc_hi          ; encoding: [0xff,0x00,0x09,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00
-# GFX11: v_maxmin_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_maxmin_f16 v5.l, v1.l, v2.l, s3       ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_maxmin_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_maxmin_f16 v5.l, v1.l, v2.l, s3       ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_maxmin_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x60,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01
-# GFX11: v_maxmin_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_maxmin_f16 v5.l, v255.l, s2, s105     ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, v255.l, s2, s105     ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x60,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01
-# GFX11: v_maxmin_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_maxmin_f16 v5.l, s1, v255.l, exec_hi  ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, s1, v255.l, exec_hi  ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x60,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01
-# GFX11: v_maxmin_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_maxmin_f16 v5.l, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x60,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX11: v_maxmin_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_maxmin_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_maxmin_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_maxmin_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_maxmin_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x60,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX11: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX11: v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_maxmin_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_maxmin_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_maxmin_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x60,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX11: v_maxmin_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_maxmin_f16 v5.l, m0, 0.5, m0          ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, m0, 0.5, m0          ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x60,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01
-# GFX11: v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_maxmin_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x60,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX11: v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_maxmin_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_maxmin_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_maxmin_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x60,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX11: v_maxmin_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_maxmin_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x60,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX11: v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_maxmin_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_maxmin_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_maxmin_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x60,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b
-# GFX11: v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-REAL16: v_maxmin_f16 v5.l, 0.5, -m0, 0.5 mul:2  ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-FAKE16: v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-REAL16: v_maxmin_f16 v5.l, 0.5, -m0, 0.5 mul:2  ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-FAKE16: v_maxmin_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x60,0xd6,0xf0,0xfa,0xc0,0x4b]
 
 0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33
-# GFX11: v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-REAL16: v_maxmin_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-FAKE16: v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-REAL16: v_maxmin_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-FAKE16: v_maxmin_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x60,0xd6,0xfd,0xd4,0x04,0x33]
 
 0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
-# GFX11: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_maxmin_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01
+# W32-REAL16: v_maxmin_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x60,0xd6,0xff,0x05,0xa4,0x01]
+
+0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01
+# W32-REAL16: v_maxmin_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_maxmin_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_maxmin_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_maxmin_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x60,0xd6,0x01,0xfe,0xff,0x01]
+
+0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_maxmin_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x60,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x5e,0xd6,0x01,0x05,0x0e,0x00
 # GFX11: v_maxmin_f32 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x5e,0xd6,0x01,0x05,0x0e,0x00]
@@ -5851,49 +5920,118 @@
 # W64-FAKE16: v_min_u16 v255, 0xfe0b, vcc_hi          ; encoding: [0xff,0x00,0x0b,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00
-# GFX11: v_minmax_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_minmax_f16 v5.l, v1.l, v2.l, s3       ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_minmax_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_minmax_f16 v5.l, v1.l, v2.l, s3       ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_minmax_f16 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x61,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01
-# GFX11: v_minmax_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_minmax_f16 v5.l, v255.l, s2, s105     ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_minmax_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, v255.l, s2, s105     ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_minmax_f16 v5, v255, s2, s105         ; encoding: [0x05,0x00,0x61,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01
-# GFX11: v_minmax_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_minmax_f16 v5.l, s1, v255.l, exec_hi  ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_minmax_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, s1, v255.l, exec_hi  ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_minmax_f16 v5, s1, v255, exec_hi      ; encoding: [0x05,0x00,0x61,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01
-# GFX11: v_minmax_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_minmax_f16 v5.l, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_minmax_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, s105, s105, exec_lo  ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_minmax_f16 v5, s105, s105, exec_lo    ; encoding: [0x05,0x00,0x61,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX11: v_minmax_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_minmax_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_minmax_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_minmax_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_minmax_f16 v5, vcc_lo, ttmp15, v3     ; encoding: [0x05,0x00,0x61,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX11: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255   ; encoding: [0x05,0x00,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX11: v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_minmax_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_minmax_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_minmax_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x61,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX11: v_minmax_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_minmax_f16 v5.l, m0, 0.5, m0          ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_minmax_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, m0, 0.5, m0          ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_minmax_f16 v5, m0, 0.5, m0            ; encoding: [0x05,0x00,0x61,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01
-# GFX11: v_minmax_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_minmax_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_minmax_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_minmax_f16 v5, |exec_lo|, -1, vcc_hi  ; encoding: [0x05,0x01,0x61,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX11: v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_minmax_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_minmax_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_minmax_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x61,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX11: v_minmax_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_minmax_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x61,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX11: v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_minmax_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_minmax_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_minmax_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x61,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b
-# GFX11: v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-REAL16: v_minmax_f16 v5.l, 0.5, -m0, 0.5 mul:2  ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-FAKE16: v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-REAL16: v_minmax_f16 v5.l, 0.5, -m0, 0.5 mul:2  ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-FAKE16: v_minmax_f16 v5, 0.5, -m0, 0.5 mul:2    ; encoding: [0x05,0x00,0x61,0xd6,0xf0,0xfa,0xc0,0x4b]
 
 0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33
-# GFX11: v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-REAL16: v_minmax_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-FAKE16: v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-REAL16: v_minmax_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-FAKE16: v_minmax_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x61,0xd6,0xfd,0xd4,0x04,0x33]
 
 0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
-# GFX11: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_minmax_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01
+# W32-REAL16: v_minmax_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_minmax_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_minmax_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x61,0xd6,0xff,0x05,0xa4,0x01]
+
+0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01
+# W32-REAL16: v_minmax_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_minmax_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_minmax_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_minmax_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x61,0xd6,0x01,0xfe,0xff,0x01]
+
+0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_minmax_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x61,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x5f,0xd6,0x01,0x05,0x0e,0x00
 # GFX11: v_minmax_f32 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x5f,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt
index 132fc80dda47..2964360a77fd 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt
@@ -2113,46 +2113,118 @@
 # W64-FAKE16: v_max_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x09,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x60,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x60,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x60,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x60,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x60,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01
-# GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x60,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
 
 0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x60,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
 
 0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
-# GFX11: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x60,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x60,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x60,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x60,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+
+0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
+# W32-REAL16: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x5e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # GFX11: v_maxmin_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -2833,46 +2905,118 @@
 # W64-FAKE16: v_min_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x0b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x61,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x61,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x61,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x61,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX11: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x61,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01
-# GFX11: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x61,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
 
 0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13
-# GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x61,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
 
 0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
-# GFX11: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x61,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x61,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x61,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x61,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+
+0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
+# W32-REAL16: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x5f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # GFX11: v_minmax_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
index 714fac9fe62a..7a81ba23afa3 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
@@ -1141,40 +1141,106 @@
 # W64-FAKE16: v_max_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x09,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x60,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x60,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x60,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x60,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x60,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x60,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
 
 0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05
-# GFX11: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x60,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
 
 0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
-# GFX11: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_maxmin_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x60,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x60,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x60,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x60,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
+# W32-REAL16: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_maxmin_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_maxmin_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x60,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x5e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # GFX11: v_maxmin_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -1585,40 +1651,106 @@
 # W64-FAKE16: v_min_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x0b,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x61,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x61,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x61,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x61,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x61,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x61,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
 
 0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05
-# GFX11: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x61,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
 
 0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
-# GFX11: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_minmax_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x61,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x61,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x61,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x61,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
+# W32-REAL16: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_minmax_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_minmax_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x61,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x5f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # GFX11: v_minmax_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
index 6d48440633f4..633d3a48634f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
@@ -4265,49 +4265,120 @@
 # W64-FAKE16: v_max_u16 v255, 0xfe0b, vcc_hi          ; encoding: [0xff,0x00,0x09,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00
-# GFX12: v_maxmin_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_maxmin_num_f16 v5.l, v1.l, v2.l, s3   ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_maxmin_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_maxmin_num_f16 v5.l, v1.l, v2.l, s3   ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_maxmin_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01
-# GFX12: v_maxmin_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_maxmin_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6b,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01
-# GFX12: v_maxmin_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_maxmin_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6b,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01
-# GFX12: v_maxmin_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_maxmin_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6b,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX12: v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_maxmin_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_maxmin_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_maxmin_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6b,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX12: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX12: v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_maxmin_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_maxmin_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_maxmin_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6b,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX12: v_maxmin_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_maxmin_num_f16 v5.l, m0, 0.5, m0      ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, m0, 0.5, m0      ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6b,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01
-# GFX12: v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_maxmin_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6b,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX12: v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_maxmin_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_maxmin_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_maxmin_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6b,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX12: v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_maxmin_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6b,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX12: v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_maxmin_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_maxmin_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_maxmin_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6b,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b
-# GFX12: v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-REAL16: v_maxmin_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-FAKE16: v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-REAL16: v_maxmin_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-FAKE16: v_maxmin_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6b,0xd6,0xf0,0xfa,0xc0,0x4b]
 
 0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33
-# GFX12: v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-REAL16: v_maxmin_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-FAKE16: v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-REAL16: v_maxmin_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-FAKE16: v_maxmin_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6b,0xd6,0xfd,0xd4,0x04,0x33]
 
 0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
-# GFX12: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_maxmin_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01
+# W32-REAL16: v_maxmin_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6b,0xd6,0xff,0x05,0xa4,0x01]
+
+
+0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01
+# W32-REAL16: v_maxmin_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_maxmin_num_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_maxmin_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_maxmin_num_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6b,0xd6,0x01,0xfe,0xff,0x01]
+
+
+0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_maxmin_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6b,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x69,0xd6,0x01,0x05,0x0e,0x00
 # GFX12: v_maxmin_num_f32 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x69,0xd6,0x01,0x05,0x0e,0x00]
@@ -5693,49 +5764,120 @@
 # W64-FAKE16: v_min_u16 v255, 0xfe0b, vcc_hi          ; encoding: [0xff,0x00,0x0b,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00
-# GFX12: v_minmax_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
+# W32-REAL16: v_minmax_num_f16 v5.l, v1.l, v2.l, s3   ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
+# W32-FAKE16: v_minmax_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
+# W64-REAL16: v_minmax_num_f16 v5.l, v1.l, v2.l, s3   ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
+# W64-FAKE16: v_minmax_num_f16 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0x05,0x0e,0x00]
 
 0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01
-# GFX12: v_minmax_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W32-REAL16: v_minmax_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, v255, s2, s105     ; encoding: [0x05,0x00,0x6a,0xd6,0xff,0x05,0xa4,0x01]
 
 0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01
-# GFX12: v_minmax_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W32-REAL16: v_minmax_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, s1, v255, exec_hi  ; encoding: [0x05,0x00,0x6a,0xd6,0x01,0xfe,0xff,0x01]
 
 0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01
-# GFX12: v_minmax_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-REAL16: v_minmax_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6a,0xd6,0x69,0xd2,0xf8,0x01]
 
 0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04
-# GFX12: v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-REAL16: v_minmax_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
+# W32-FAKE16: v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-REAL16: v_minmax_num_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
+# W64-FAKE16: v_minmax_num_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6a,0xd6,0x6a,0xf6,0x0c,0x04]
 
 0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
-# GFX12: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
 0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1
-# GFX12: v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-REAL16: v_minmax_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
+# W32-FAKE16: v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-REAL16: v_minmax_num_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
+# W64-FAKE16: v_minmax_num_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x6a,0xd6,0x7b,0xfa,0xed,0xe1]
 
 0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01
-# GFX12: v_minmax_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-REAL16: v_minmax_num_f16 v5.l, m0, 0.5, m0      ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, m0, 0.5, m0      ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, m0, 0.5, m0        ; encoding: [0x05,0x00,0x6a,0xd6,0x7d,0xe0,0xf5,0x01]
 
 0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01
-# GFX12: v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
+# W32-REAL16: v_minmax_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6a,0xd6,0x7e,0x82,0xad,0x01]
 
 0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1
-# GFX12: v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-REAL16: v_minmax_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W32-FAKE16: v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-REAL16: v_minmax_num_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
+# W64-FAKE16: v_minmax_num_f16 v5, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x6a,0xd6,0x7f,0xf8,0xa8,0xa1]
 
 0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00
-# GFX12: v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_minmax_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x6a,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
 0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3
-# GFX12: v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-REAL16: v_minmax_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W32-FAKE16: v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-REAL16: v_minmax_num_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
+# W64-FAKE16: v_minmax_num_f16 v5, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x6a,0xd6,0xc1,0xfe,0xf4,0xc3]
 
 0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b
-# GFX12: v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-REAL16: v_minmax_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W32-FAKE16: v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-REAL16: v_minmax_num_f16 v5.l, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
+# W64-FAKE16: v_minmax_num_f16 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6a,0xd6,0xf0,0xfa,0xc0,0x4b]
 
 0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33
-# GFX12: v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-REAL16: v_minmax_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
+# W32-FAKE16: v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-REAL16: v_minmax_num_f16 v5.l, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
+# W64-FAKE16: v_minmax_num_f16 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6a,0xd6,0xfd,0xd4,0x04,0x33]
 
 0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
-# GFX12: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_minmax_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01
+# W32-REAL16: v_minmax_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, v255, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x6a,0xd6,0xff,0x05,0xa4,0x01]
+
+
+0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01
+# W32-REAL16: v_minmax_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W32-FAKE16: v_minmax_num_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W64-REAL16: v_minmax_num_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+# W64-FAKE16: v_minmax_num_f16 v5, s1, v255, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x6a,0xd6,0x01,0xfe,0xff,0x01]
+
+
+0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16 v5, vcc_hi, 0xfe0b, v255 op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_minmax_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x6a,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x68,0xd6,0x01,0x05,0x0e,0x00
 # GFX12: v_minmax_num_f32 v5, v1, v2, s3         ; encoding: [0x05,0x00,0x68,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
index 561d3a6ca7f9..7e30a4a2096b 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
@@ -2329,52 +2329,131 @@
 # W64-FAKE16: v_max_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x09,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -2.0, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6b,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6b,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6b,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6b,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01
-# GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6b,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
 
 0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6b,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
 
 0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
-# GFX12: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+
+0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6b,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6b,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+
+0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -3082,49 +3161,125 @@
 # W64-FAKE16: v_min_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x0b,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, s3, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
 
 0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x6a,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
 
 0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x6a,0xd6,0xfa,0x04,0xfe,0x41,0x01,0x21,0x01,0xff]
 
 0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x6a,0xd6,0xfa,0x04,0xfa,0x21,0x01,0x2f,0x01,0xff]
 
 0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff
-# GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x03,0x6a,0xd6,0xfa,0x04,0xf2,0x61,0x01,0x50,0x01,0xff]
 
 0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01
-# GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x05,0x6a,0xd6,0xfa,0x04,0x06,0xab,0x01,0x5f,0x01,0x01]
 
 0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x06,0x6a,0xd6,0xfa,0x04,0xc2,0xd3,0x01,0x60,0x01,0x13]
 
 0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
-# GFX12: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+
+0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x78,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+
+
+0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x20,0x6a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
+
+0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x0a,0x6a,0xd6,0xfa,0x04,0x06,0x2b,0x01,0x5f,0x01,0x01]
+
+0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x13,0x6a,0xd6,0xfa,0x04,0xc2,0x73,0x01,0x60,0x01,0x13]
+
+0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30
+# W32-REAL16: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
index 06b4bfcc8985..2aaba2a17fae 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
@@ -1294,43 +1294,112 @@
 # W64-FAKE16: v_max_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x09,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6b,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6b,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6b,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6b,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
 
 0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05
-# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6b,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
 
 0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
-# GFX12: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6b,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6b,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
+# W32-REAL16: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_maxmin_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6b,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -1768,43 +1837,112 @@
 # W64-FAKE16: v_min_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x0b,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, s3, v3.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, |v1|, v2, -ttmp15 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd6,0xe9,0x04,0xee,0x81,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, exec_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x6a,0xd6,0xe9,0x04,0xfe,0x41,0x01,0x77,0x39,0x05]
 
 0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.l, v2.l, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, v2, |exec_lo| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x04,0x6a,0xd6,0xe9,0x04,0xfa,0x21,0x01,0x77,0x39,0x05]
 
 0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.l|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, null dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x03,0x6a,0xd6,0xe9,0x04,0xf2,0x61,0x01,0x77,0x39,0x05]
 
 0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, v2.l, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, v2, -|-1| mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x05,0x6a,0xd6,0xe9,0x04,0x06,0xab,0x01,0x77,0x39,0x05]
 
 0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05
-# GFX12: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, -|v2.l|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x06,0x6a,0xd6,0xe9,0x04,0xc2,0xd3,0x01,0x77,0x39,0x05]
 
 0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
-# GFX12: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v255.l, -|v255.l|, -|v255.l|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+
+0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.h, v1.h, v2.h, v3.h op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 op_sel:[1,1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x78,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+
+0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, v1.l, v2.l, v255.h op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 op_sel:[0,0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x20,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
+
+0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -v1.h, |v2.l|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -v1, |v2|, -1 op_sel:[1,0,0,0] mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x0a,0x6a,0xd6,0xe9,0x04,0x06,0x2b,0x01,0x77,0x39,0x05]
+
+0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05
+# W32-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v5.l, -|v1.l|, -|v2.h|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v5, -|v1|, -|v2|, 0.5 op_sel:[0,1,0,0] mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x13,0x6a,0xd6,0xe9,0x04,0xc2,0x73,0x01,0x77,0x39,0x05]
+
+0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00
+# W32-REAL16: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_minmax_num_f16_e64_dpp v255.h, -|v255.l|, -|v255.l|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| op_sel:[0,0,0,1] clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc7,0x6a,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/RISCV/custom_reloc.s b/llvm/test/MC/RISCV/custom_reloc.s
index 4bd470008ee5..cdb819467875 100644
--- a/llvm/test/MC/RISCV/custom_reloc.s
+++ b/llvm/test/MC/RISCV/custom_reloc.s
@@ -21,16 +21,33 @@
   .reloc ., R_RISCV_VENDOR,    VENDOR_NAME
   .reloc ., R_RISCV_CUSTOM192, my_foo + 1
   addi a0, a0, 0
-  # CHECK-ASM: [[L1:.L[^:]+]]:
+  # CHECK-ASM:      [[L1:.L[^:]+]]:
   # CHECK-ASM-NEXT: .reloc [[L1]], R_RISCV_VENDOR, VENDOR_NAME
   # CHECK-ASM-NEXT: [[L2:.L[^:]+]]:
   # CHECK-ASM-NEXT: .reloc [[L2]], R_RISCV_CUSTOM192, my_foo+1
   # CHECK-ASM-NEXT: mv a0, a0
 
-  # CHECK-OBJ: addi a0, a0, 0
+  # CHECK-OBJ:      addi a0, a0, 0
   # CHECK-OBJ-NEXT: R_RISCV_VENDOR    VENDOR_NAME
   # CHECK-OBJ-NEXT: R_RISCV_CUSTOM192 my_foo+0x1
 
   nop
   # CHECK-ASM: nop
   # CHECK-OBJ: addi zero, zero, 0x0
+
+  .reloc ., R_RISCV_VENDOR,     QUALCOMM
+  .reloc ., R_RISCV_QC_ABS20_U, my_bar + 2
+  addi a1, a1, 0
+  # CHECK-ASM:      [[L3:.L[^:]+]]:
+  # CHECK-ASM-NEXT: .reloc [[L3]], R_RISCV_VENDOR, QUALCOMM
+  # CHECK-ASM-NEXT: [[L4:.L[^:]+]]:
+  # CHECK-ASM-NEXT: .reloc [[L4]], R_RISCV_QC_ABS20_U, my_bar+2
+  # CHECK-ASM-NEXT: mv a1, a1
+
+  # CHECK-OBJ:      addi a1, a1, 0
+  # CHECK-OBJ-NEXT: R_RISCV_VENDOR    QUALCOMM
+  # CHECK-OBJ-NEXT: R_RISCV_CUSTOM192 my_bar+0x2
+
+  nop
+  # CHECK-ASM: nop
+  # CHECK-OBJ: addi zero, zero, 0x0
diff --git a/llvm/test/MC/RISCV/xqcicm-invalid.s b/llvm/test/MC/RISCV/xqcicm-invalid.s
new file mode 100644
index 000000000000..8b37ed4edeb0
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqcicm-invalid.s
@@ -0,0 +1,152 @@
+# Xqcicm - Qualcomm uC Conditional Move Extension
+# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-xqcicm < %s 2>&1 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-IMM %s
+# RUN: not llvm-mc -triple riscv32 -mattr=-experimental-xqcicm < %s 2>&1 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-EXT %s
+
+# CHECK: :[[@LINE+1]]:12: error: invalid operand for instruction
+qc.c.mveqz 9, x10
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.c.mveqz x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.c.mveqz x9, x10
+
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.mveq 9, x10, x11, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mveq x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mveq x9, x10, x11, x12
+
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.mvge 9, x10, x11, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvge x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvge x9, x10, x11, x12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.mvgeu 9, x10, x11, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvgeu x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvgeu x9, x10, x11, x12
+
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.mvlt 9, x10, x11, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvlt x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvlt x9, x10, x11, x12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.mvltu 9, x10, x11, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvltu x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvltu x9, x10, x11, x12
+
+
+# CHECK: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.mvne 9, x10, x11, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvne x9
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvne x9, x10, x11, x12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.mveqi 9, x10, 5, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mveqi x9
+
+# CHECK-IMM: :[[@LINE+1]]:19: error: immediate must be an integer in the range [-16, 15]
+qc.mveqi x9, x10, 17, x12
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mveqi x9, x10, 5, x12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.mvgei 9, x10, 5, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvgei x9
+
+# CHECK-IMM: :[[@LINE+1]]:19: error: immediate must be an integer in the range [-16, 15]
+qc.mvgei x9, x10, 17, x12
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvgei x9, x10, 5, x12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.mvlti 9, x10, 5, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvlti x9
+
+# CHECK-IMM: :[[@LINE+1]]:19: error: immediate must be an integer in the range [-16, 15]
+qc.mvlti x9, x10, 17, x12
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvlti x9, x10, 5, x12
+
+
+# CHECK: :[[@LINE+1]]:10: error: invalid operand for instruction
+qc.mvnei 9, x10, 5, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvnei x9
+
+# CHECK-IMM: :[[@LINE+1]]:19: error: immediate must be an integer in the range [-16, 15]
+qc.mvnei x9, x10, 17, x12
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvnei x9, x10, 5, x12
+
+
+# CHECK: :[[@LINE+1]]:11: error: invalid operand for instruction
+qc.mvltui 9, x10, 5, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvltui x9
+
+# CHECK-IMM: :[[@LINE+1]]:20: error: immediate must be an integer in the range [0, 31]
+qc.mvltui x9, x10, 37, x12
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvltui x9, x10, 5, x12
+
+
+# CHECK: :[[@LINE+1]]:11: error: invalid operand for instruction
+qc.mvgeui 9, x10, 5, x12
+
+# CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
+qc.mvgeui x9
+
+# CHECK-IMM: :[[@LINE+1]]:20: error: immediate must be an integer in the range [0, 31]
+qc.mvgeui x9, x10, 37, x12
+
+# CHECK-EXT: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcicm' (Qualcomm uC Conditional Move Extension)
+qc.mvgeui x9, x10, 5, x12
diff --git a/llvm/test/MC/RISCV/xqcicm-valid.s b/llvm/test/MC/RISCV/xqcicm-valid.s
new file mode 100644
index 000000000000..7d0050b6dafa
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqcicm-valid.s
@@ -0,0 +1,123 @@
+# Xqcicm - Qualcomm uC Conditional Move Extension
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicm -riscv-no-aliases -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcicm < %s \
+# RUN:     | llvm-objdump --mattr=+experimental-xqcicm -M no-aliases --no-print-imm-hex -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-xqcicm -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ENC,CHECK-INST %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcicm < %s \
+# RUN:     | llvm-objdump --mattr=+experimental-xqcicm --no-print-imm-hex -d - \
+# RUN:     | FileCheck -check-prefix=CHECK-INST %s
+
+# CHECK-INST: qc.c.mveqz      s1, a0
+# CHECK-ENC: encoding: [0x06,0xad]
+qc.c.mveqz x9, x10
+
+
+# CHECK-INST: qc.mveq s1, a0, a1, a2
+# CHECK-ENC: encoding: [0xdb,0x04,0xb5,0x60]
+qc.mveq x9, x10, x11, x12
+
+
+# CHECK-INST: qc.mvge s1, a0, a1, a2
+# CHECK-ENC: encoding: [0xdb,0x54,0xb5,0x60]
+qc.mvge x9, x10, x11, x12
+
+
+# CHECK-INST: qc.mvgeu        s1, a0, a1, a2
+# CHECK-ENC: encoding: [0xdb,0x74,0xb5,0x60]
+qc.mvgeu x9, x10, x11, x12
+
+
+# CHECK-INST: qc.mvlt s1, a0, a1, a2
+# CHECK-ENC: encoding: [0xdb,0x44,0xb5,0x60]
+qc.mvlt x9, x10, x11, x12
+
+
+# CHECK-INST: qc.mvltu        s1, a0, a1, a2
+# CHECK-ENC: encoding: [0xdb,0x64,0xb5,0x60]
+qc.mvltu x9, x10, x11, x12
+
+
+# CHECK-INST: qc.mvne s1, a0, a1, a2
+# CHECK-ENC: encoding: [0xdb,0x14,0xb5,0x60]
+qc.mvne x9, x10, x11, x12
+
+
+# CHECK-INST: qc.mveqi        s1, a0, 5, a2
+# CHECK-ENC: encoding: [0xdb,0x04,0x55,0x64]
+qc.mveqi x9, x10, 5, x12
+
+# CHECK-INST: qc.mveqi        s1, a0, -16, a2
+# CHECK-ENC: encoding: [0xdb,0x04,0x05,0x65]
+qc.mveqi x9, x10, -16, x12
+
+# CHECK-INST: qc.mveqi        s1, a0, 15, a2
+# CHECK-ENC: encoding: [0xdb,0x04,0xf5,0x64]
+qc.mveqi x9, x10, 15, x12
+
+
+# CHECK-INST: qc.mvgei        s1, a0, 5, a2
+# CHECK-ENC: encoding: [0xdb,0x54,0x55,0x64]
+qc.mvgei x9, x10, 5, x12
+
+# CHECK-INST: qc.mvgei        s1, a0, -16, a2
+# CHECK-ENC: encoding: [0xdb,0x54,0x05,0x65]
+qc.mvgei x9, x10, -16, x12
+
+# CHECK-INST: qc.mvgei        s1, a0, 15, a2
+# CHECK-ENC: encoding: [0xdb,0x54,0xf5,0x64]
+qc.mvgei x9, x10, 15, x12
+
+
+# CHECK-INST: qc.mvlti        s1, a0, 5, a2
+# CHECK-ENC: encoding: [0xdb,0x44,0x55,0x64]
+qc.mvlti x9, x10, 5, x12
+
+# CHECK-INST: qc.mvlti        s1, a0, -16, a2
+# CHECK-ENC: encoding: [0xdb,0x44,0x05,0x65]
+qc.mvlti x9, x10, -16, x12
+
+# CHECK-INST: qc.mvlti        s1, a0, 15, a2
+# CHECK-ENC: encoding: [0xdb,0x44,0xf5,0x64]
+qc.mvlti x9, x10, 15, x12
+
+
+# CHECK-INST: qc.mvnei        s1, a0, 5, a2
+# CHECK-ENC: encoding: [0xdb,0x14,0x55,0x64]
+qc.mvnei x9, x10, 5, x12
+
+# CHECK-INST: qc.mvnei        s1, a0, -16, a2
+# CHECK-ENC: encoding: [0xdb,0x14,0x05,0x65]
+qc.mvnei x9, x10, -16, x12
+
+# CHECK-INST: qc.mvnei        s1, a0, 15, a2
+# CHECK-ENC: encoding: [0xdb,0x14,0xf5,0x64]
+qc.mvnei x9, x10, 15, x12
+
+
+# CHECK-INST: qc.mvltui       s1, a0, 5, a2
+# CHECK-ENC: encoding: [0xdb,0x64,0x55,0x64]
+qc.mvltui x9, x10, 5, x12
+
+# CHECK-INST: qc.mvltui       s1, a0, 0, a2
+# CHECK-ENC: encoding: [0xdb,0x64,0x05,0x64]
+qc.mvltui x9, x10, 0, x12
+
+# CHECK-INST: qc.mvltui       s1, a0, 31, a2
+# CHECK-ENC: encoding: [0xdb,0x64,0xf5,0x65]
+qc.mvltui x9, x10, 31, x12
+
+
+# CHECK-INST: qc.mvgeui       s1, a0, 5, a2
+# CHECK-ENC: encoding: [0xdb,0x74,0x55,0x64]
+qc.mvgeui x9, x10, 5, x12
+
+# CHECK-INST: qc.mvgeui       s1, a0, 0, a2
+# CHECK-ENC: encoding: [0xdb,0x74,0x05,0x64]
+qc.mvgeui x9, x10, 0, x12
+
+# CHECK-INST: qc.mvgeui       s1, a0, 31, a2
+# CHECK-ENC: encoding: [0xdb,0x74,0xf5,0x65]
+qc.mvgeui x9, x10, 31, x12
diff --git a/llvm/test/ThinLTO/X86/memprof-recursive.ll b/llvm/test/ThinLTO/X86/memprof-recursive.ll
new file mode 100644
index 000000000000..2b1d7081b761
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-recursive.ll
@@ -0,0 +1,141 @@
+;; Test recursion handling during cloning.
+;;
+;; See llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll for
+;; information on how the test was created.
+
+; RUN: opt -thinlto-bc %s >%t.o
+
+;; By default we should enable cloning of contexts involved with recursive
+;; cycles, but not through the cycle itself. I.e. until full support for
+;; recursion is added, the cloned recursive call from C back to B (line 12) will
+;; not be updated to call a clone.
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t.o,_Z1Dv,plx \
+; RUN:  -r=%t.o,_Z1Ci,plx \
+; RUN:  -r=%t.o,_Z1Bi,plx \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  -o %t.out 2>&1 | FileCheck %s \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS
+
+;; Skipping recursive callsites should result in no cloning.
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t.o,_Z1Dv,plx \
+; RUN:  -r=%t.o,_Z1Ci,plx \
+; RUN:  -r=%t.o,_Z1Bi,plx \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:	-memprof-allow-recursive-callsites=false \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --allow-empty \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --implicit-check-not="created clone" \
+; RUN:	--implicit-check-not="marked with memprof allocation attribute cold"
+
+;; Skipping recursive contexts should prevent spurious call to cloned version of
+;; B from the context starting at memprof_recursive.cc:19:13, which is actually
+;; recursive (until that support is added).
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t.o,_Z1Dv,plx \
+; RUN:  -r=%t.o,_Z1Ci,plx \
+; RUN:  -r=%t.o,_Z1Bi,plx \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:	-memprof-allow-recursive-contexts=false \
+; RUN:  -o %t.out 2>&1 | FileCheck %s \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=SKIP-RECUR-CONTEXTS
+
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:4:0: created clone _Z1Dv.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:5:10: call in clone _Z1Dv marked with memprof allocation attribute notcold
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:5:10: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:8:0: created clone _Z1Ci.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:10:12: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Dv.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:14:0: created clone _Z1Bi.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:15:10: call in clone _Z1Bi.memprof.1 assigned to call function clone _Z1Ci.memprof.1
+;; We should only call the cold clone for the recursive context if we enabled
+;; recursive contexts via -memprof-allow-recursive-contexts=true (default).
+; ALLOW-RECUR-CONTEXTS: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi.memprof.1
+; SKIP-RECUR-CONTEXTS-NOT: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:20:13: call in clone main assigned to call function clone _Z1Bi.memprof.1
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define ptr @_Z1Dv() !dbg !3 {
+entry:
+  %call = tail call ptr @_Znam(i64 10), !dbg !6, !memprof !7, !callsite !14
+  ret ptr null
+}
+
+define ptr @_Z1Ci(i32 %n) !dbg !15 {
+entry:
+  %call = tail call ptr @_Z1Dv(), !dbg !16, !callsite !17
+  br label %return
+
+if.end:                                           ; No predecessors!
+  %call1 = tail call ptr @_Z1Bi(i32 0), !dbg !18, !callsite !19
+  br label %return
+
+return:                                           ; preds = %if.end, %entry
+  ret ptr null
+}
+
+define ptr @_Z1Bi(i32 %n) !dbg !20 {
+entry:
+  %call = tail call ptr @_Z1Ci(i32 0), !dbg !21, !callsite !22
+  ret ptr null
+}
+
+define i32 @main() {
+entry:
+  %call = tail call ptr @_Z1Bi(i32 0), !dbg !23, !callsite !25
+  %call1 = tail call ptr @_Z1Bi(i32 0), !dbg !26, !callsite !27
+  %call2 = tail call ptr @_Z1Bi(i32 0), !dbg !28, !callsite !29
+  ret i32 0
+}
+
+declare ptr @_Znam(i64)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 20.0.0git (https://github.com/llvm/llvm-project.git 7aec6dc477f8148ed066d10dfc7a012a51b6599c)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "memprof_recursive.cc", directory: ".", checksumkind: CSK_MD5, checksum: "2f15f63b187a0e0d40e7fdd18b10576a")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "D", linkageName: "_Z1Dv", scope: !1, file: !1, line: 4, type: !4, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!4 = !DISubroutineType(types: !5)
+!5 = !{}
+!6 = !DILocation(line: 5, column: 10, scope: !3)
+!7 = !{!8, !10, !12}
+!8 = !{!9, !"cold"}
+!9 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 6307901912192269588}
+!10 = !{!11, !"notcold"}
+!11 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 -7155190423157709404, i64 -2954124005641725917, i64 8632435727821051414}
+!12 = !{!13, !"cold"}
+!13 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 -7155190423157709404, i64 -2954124005641725917, i64 -3421689549917153178}
+!14 = !{i64 6541423618768552252}
+!15 = distinct !DISubprogram(name: "C", linkageName: "_Z1Ci", scope: !1, file: !1, line: 8, type: !4, scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!16 = !DILocation(line: 10, column: 12, scope: !15)
+!17 = !{i64 -200552803509692312}
+!18 = !DILocation(line: 12, column: 10, scope: !15)
+!19 = !{i64 -7155190423157709404}
+!20 = distinct !DISubprogram(name: "B", linkageName: "_Z1Bi", scope: !1, file: !1, line: 14, type: !4, scopeLine: 14, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!21 = !DILocation(line: 15, column: 10, scope: !20)
+!22 = !{i64 -2954124005641725917}
+!23 = !DILocation(line: 18, column: 13, scope: !24)
+!24 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 17, type: !4, scopeLine: 17, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!25 = !{i64 8632435727821051414}
+!26 = !DILocation(line: 19, column: 13, scope: !24)
+!27 = !{i64 -3421689549917153178}
+!28 = !DILocation(line: 20, column: 13, scope: !24)
+!29 = !{i64 6307901912192269588}
diff --git a/llvm/test/Transforms/InstCombine/and-xor-or.ll b/llvm/test/Transforms/InstCombine/and-xor-or.ll
index 5a0890e918ef..5a58995f6c31 100644
--- a/llvm/test/Transforms/InstCombine/and-xor-or.ll
+++ b/llvm/test/Transforms/InstCombine/and-xor-or.ll
@@ -388,10 +388,9 @@ define i8 @xor_shl(i8 %x, i8 %y, i8 %zarg, i8 %shamt) {
 ; CHECK-LABEL: define {{[^@]+}}@xor_shl
 ; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[ZARG:%.*]], i8 [[SHAMT:%.*]]) {
 ; CHECK-NEXT:    [[Z:%.*]] = sdiv i8 42, [[ZARG]]
-; CHECK-NEXT:    [[SX:%.*]] = shl i8 [[X]], [[SHAMT]]
-; CHECK-NEXT:    [[SY:%.*]] = shl i8 [[Y]], [[SHAMT]]
-; CHECK-NEXT:    [[A:%.*]] = xor i8 [[Z]], [[SX]]
-; CHECK-NEXT:    [[R:%.*]] = xor i8 [[A]], [[SY]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = xor i8 [[TMP2]], [[Z]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %z = sdiv i8 42, %zarg ; thwart complexity-based canonicalization
@@ -406,10 +405,9 @@ define i8 @and_lshr(i8 %x, i8 %y, i8 %zarg, i8 %shamt) {
 ; CHECK-LABEL: define {{[^@]+}}@and_lshr
 ; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[ZARG:%.*]], i8 [[SHAMT:%.*]]) {
 ; CHECK-NEXT:    [[Z:%.*]] = sdiv i8 42, [[ZARG]]
-; CHECK-NEXT:    [[SX:%.*]] = lshr i8 [[X]], [[SHAMT]]
-; CHECK-NEXT:    [[SY:%.*]] = lshr i8 [[Y]], [[SHAMT]]
-; CHECK-NEXT:    [[A:%.*]] = and i8 [[Z]], [[SX]]
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[SY]], [[A]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[TMP2]], [[Z]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %z = sdiv i8 42, %zarg ; thwart complexity-based canonicalization
@@ -435,6 +433,51 @@ define i8 @or_lshr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
   ret i8 %r
 }
 
+define i8 @or_lshr_commuted1(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; CHECK-LABEL: define {{[^@]+}}@or_lshr_commuted1
+; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]], i8 [[SHAMT:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[TMP2]], [[Z]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = lshr i8 %x, %shamt
+  %sy = lshr i8 %y, %shamt
+  %a = or i8 %z, %sx
+  %r = or i8 %sy, %a
+  ret i8 %r
+}
+
+define i8 @or_lshr_commuted2(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; CHECK-LABEL: define {{[^@]+}}@or_lshr_commuted2
+; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]], i8 [[SHAMT:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[TMP2]], [[Z]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = lshr i8 %x, %shamt
+  %sy = lshr i8 %y, %shamt
+  %a = or i8 %z, %sx
+  %r = or i8 %a, %sy
+  ret i8 %r
+}
+
+define i8 @or_lshr_commuted3(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; CHECK-LABEL: define {{[^@]+}}@or_lshr_commuted3
+; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]], i8 [[SHAMT:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i8 [[TMP1]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[TMP2]], [[Z]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = lshr i8 %x, %shamt
+  %sy = lshr i8 %y, %shamt
+  %a = or i8 %sx, %z
+  %r = or i8 %a, %sy
+  ret i8 %r
+}
+
 define i8 @xor_lshr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
 ; CHECK-LABEL: define {{[^@]+}}@xor_lshr
 ; CHECK-SAME: (i8 [[X:%.*]], i8 [[Y:%.*]], i8 [[Z:%.*]], i8 [[SHAMT:%.*]]) {
diff --git a/llvm/test/Transforms/InstCombine/compare-signs.ll b/llvm/test/Transforms/InstCombine/compare-signs.ll
index 9703b47b44d0..59ec9adb30b9 100644
--- a/llvm/test/Transforms/InstCombine/compare-signs.ll
+++ b/llvm/test/Transforms/InstCombine/compare-signs.ll
@@ -152,6 +152,19 @@ define i1 @test4a(i32 %a) {
   ret i1 %c
 }
 
+define i1 @test4a_commuted(i32 %a) {
+; CHECK-LABEL: @test4a_commuted(
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[SIGNUM:%.*]], 1
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %l = ashr i32 %a, 31
+  %na = sub i32 0, %a
+  %r = lshr i32 %na, 31
+  %signum = or i32 %r, %l
+  %c = icmp slt i32 %signum, 1
+  ret i1 %c
+}
+
 define <2 x i1> @test4a_vec(<2 x i32> %a) {
 ; CHECK-LABEL: @test4a_vec(
 ; CHECK-NEXT:    [[C:%.*]] = icmp slt <2 x i32> [[A:%.*]], splat (i32 1)
diff --git a/llvm/test/Transforms/InstCombine/icmp-add.ll b/llvm/test/Transforms/InstCombine/icmp-add.ll
index 579247aaccf2..a8cdf80948a8 100644
--- a/llvm/test/Transforms/InstCombine/icmp-add.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-add.ll
@@ -79,6 +79,19 @@ bb:
   ret i1 %i4
 }
 
+define i1 @cvt_icmp_0_zext_plus_zext_eq_i2(i1 %a, i1 %b) {
+; CHECK-LABEL: @cvt_icmp_0_zext_plus_zext_eq_i2(
+; CHECK-NEXT:    [[TMP1:%.*]] = or i1 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = xor i1 [[TMP1]], true
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %a.ext = zext i1 %a to i2
+  %b.ext = zext i1 %b to i2
+  %add = add i2 %a.ext, %b.ext
+  %cmp = icmp eq i2 %add, 0
+  ret i1 %cmp
+}
+
 define i1 @cvt_icmp_1_zext_plus_zext_eq(i1 %arg, i1 %arg1) {
 ; CHECK-LABEL: @cvt_icmp_1_zext_plus_zext_eq(
 ; CHECK-NEXT:  bb:
diff --git a/llvm/test/Transforms/InstCombine/onehot_merge.ll b/llvm/test/Transforms/InstCombine/onehot_merge.ll
index 2e57597455c2..3b7314d36eaa 100644
--- a/llvm/test/Transforms/InstCombine/onehot_merge.ll
+++ b/llvm/test/Transforms/InstCombine/onehot_merge.ll
@@ -1143,3 +1143,19 @@ define i1 @foo1_and_signbit_lshr_without_shifting_signbit_not_pwr2_logical(i32 %
   %or = select i1 %t2, i1 true, i1 %t4
   ret i1 %or
 }
+
+define i1 @two_types_of_bittest(i8 %x, i8 %c) {
+; CHECK-LABEL: @two_types_of_bittest(
+; CHECK-NEXT:    [[T0:%.*]] = shl nuw i8 1, [[C:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[T0]], -128
+; CHECK-NEXT:    [[TMP2:%.*]] = and i8 [[X:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    ret i1 [[RET]]
+;
+  %t0 = shl i8 1, %c
+  %icmp1 = icmp slt i8 %x, 0
+  %and = and i8 %x, %t0
+  %icmp2 = icmp ne i8 %and, 0
+  %ret = and i1 %icmp1, %icmp2
+  ret i1 %ret
+}
diff --git a/llvm/test/Transforms/InstCombine/phi.ll b/llvm/test/Transforms/InstCombine/phi.ll
index 33849291d832..4756b4f30e56 100644
--- a/llvm/test/Transforms/InstCombine/phi.ll
+++ b/llvm/test/Transforms/InstCombine/phi.ll
@@ -2828,13 +2828,13 @@ define i1 @test_zext_icmp_eq_0(i1 %a, i1 %b, i32 %c) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[A:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
 ; CHECK:       if:
-; CHECK-NEXT:    [[B_EXT:%.*]] = zext i1 [[B:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[B:%.*]], true
 ; CHECK-NEXT:    br label [[JOIN:%.*]]
 ; CHECK:       else:
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT:    br label [[JOIN]]
 ; CHECK:       join:
-; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[B_EXT]], [[IF]] ], [ [[C:%.*]], [[ELSE]] ]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI]], 0
+; CHECK-NEXT:    [[CMP:%.*]] = phi i1 [ [[TMP0]], [[IF]] ], [ [[TMP1]], [[ELSE]] ]
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
 entry:
@@ -2916,10 +2916,9 @@ define i1 @test_zext_icmp_eq_0_loop(i1 %c, i1 %b) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[EXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[X:%.*]] = icmp eq i32 [[PHI]], 0
+; CHECK-NEXT:    [[X:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[Y:%.*]] = and i1 [[X]], [[B:%.*]]
-; CHECK-NEXT:    [[EXT]] = zext i1 [[Y]] to i32
+; CHECK-NEXT:    [[TMP0]] = xor i1 [[Y]], true
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret i1 [[X]]
diff --git a/llvm/test/Transforms/InstCombine/select-divrem.ll b/llvm/test/Transforms/InstCombine/select-divrem.ll
index a674f9c64b20..7dff78e3057e 100644
--- a/llvm/test/Transforms/InstCombine/select-divrem.ll
+++ b/llvm/test/Transforms/InstCombine/select-divrem.ll
@@ -322,6 +322,21 @@ define i8 @rem_euclid_non_const_pow2(i8 %0, i8 %1) {
   ret i8 %sel
 }
 
+define i8 @rem_euclid_non_const_pow2_commuted(i8 %0, i8 %1) {
+; CHECK-LABEL: @rem_euclid_non_const_pow2_commuted(
+; CHECK-NEXT:    [[NOTMASK:%.*]] = shl nsw i8 -1, [[TMP0:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i8 [[NOTMASK]], -1
+; CHECK-NEXT:    [[SEL:%.*]] = and i8 [[TMP1:%.*]], [[TMP3]]
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %pow2 = shl i8 1, %0
+  %rem = srem i8 %1, %pow2
+  %cond = icmp slt i8 %rem, 0
+  %add = add i8 %pow2, %rem
+  %sel = select i1 %cond, i8 %add, i8 %rem
+  ret i8 %sel
+}
+
 define i32 @rem_euclid_pow2_true_arm_folded(i32 %n) {
 ; CHECK-LABEL: @rem_euclid_pow2_true_arm_folded(
 ; CHECK-NEXT:    [[RES:%.*]] = and i32 [[N:%.*]], 1
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index 9de3c2483ba4..0f15fa6c0cc5 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -3937,11 +3937,8 @@ entry:
 define i32 @src_or_eq_0_and_xor(i32 %x, i32 %y) {
 ; CHECK-LABEL: @src_or_eq_0_and_xor(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[OR]], 0
-; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Y]], [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 0, i32 [[XOR]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
 ;
 entry:
   %or = or i32 %y, %x
@@ -3956,11 +3953,8 @@ entry:
 define i32 @src_or_eq_0_xor_and(i32 %x, i32 %y) {
 ; CHECK-LABEL: @src_or_eq_0_xor_and(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[OR]], 0
-; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y]], [[X]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 0, i32 [[AND]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[AND]]
 ;
 entry:
   %or = or i32 %y, %x
@@ -4438,11 +4432,8 @@ define i32 @src_no_trans_select_and_eq0_xor_and(i32 %x, i32 %y) {
 
 define i32 @src_no_trans_select_or_eq0_or_and(i32 %x, i32 %y) {
 ; CHECK-LABEL: @src_no_trans_select_or_eq0_or_and(
-; CHECK-NEXT:    [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[OR0:%.*]] = icmp eq i32 [[OR]], 0
-; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[Y]]
-; CHECK-NEXT:    [[COND:%.*]] = select i1 [[OR0]], i32 0, i32 [[AND]]
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[AND]]
 ;
   %or = or i32 %x, %y
   %or0 = icmp eq i32 %or, 0
@@ -4837,3 +4828,16 @@ define i32 @replace_and_cond_multiuse2(i1 %cond1, i1 %cond2) {
   %mux = select i1 %cond1, i32 %sel, i32 1
   ret i32 %mux
 }
+
+define i32 @src_simplify_2x_at_once_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_simplify_2x_at_once_and(
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
+;
+  %and = and i32 %x, %y
+  %and0 = icmp eq i32 %and, -1
+  %sub = sub i32 %x, %y
+  %xor = xor i32 %x, %y
+  %cond = select i1 %and0, i32 %sub, i32 %xor
+  ret i32 %cond
+}
diff --git a/llvm/test/Transforms/InstCombine/xor-and-or.ll b/llvm/test/Transforms/InstCombine/xor-and-or.ll
index 47275ce31070..c380e2748f89 100644
--- a/llvm/test/Transforms/InstCombine/xor-and-or.ll
+++ b/llvm/test/Transforms/InstCombine/xor-and-or.ll
@@ -25,6 +25,18 @@ define i1 @xor_logic_and_logic_or2(i1 %c, i1 %x, i1 %y) {
   ret i1 %r
 }
 
+define i1 @xor_logic_and_logic_or2_commuted(i1 %c, i1 %x, i1 %y) {
+; CHECK-LABEL: @xor_logic_and_logic_or2_commuted(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i1 [[X:%.*]], true
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[C:%.*]], i1 [[TMP1]], i1 [[Y:%.*]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %o = select i1 %y, i1 true, i1 %c
+  %a = select i1 %c, i1 %x, i1 false
+  %r = xor i1 %o, %a
+  ret i1 %r
+}
+
 define i1 @xor_logic_and_logic_or3(i1 %c, i1 %x, i1 %y) {
 ; CHECK-LABEL: @xor_logic_and_logic_or3(
 ; CHECK-NEXT:    [[TMP1:%.*]] = freeze i1 [[C:%.*]]
diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll
new file mode 100644
index 000000000000..543c73137c1b
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll
@@ -0,0 +1,1129 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instsimplify -march=nvptx64 -S | FileCheck %s
+
+; f2i/f2ui and d2i/d2ui - double/float to i32 tests
+
+;###############################################################
+;#               Tests with Positive 1.5                       #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_f2i_rm() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rm() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rn() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rn() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float 1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_1_5_f2i_rp() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rp() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float 1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 1.5)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_d2i_rm() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rm() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_d2i_rn() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rn() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double 1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_1_5_d2i_rp() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rp() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_d2i_rz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2i_rz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double 1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rm() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rn() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float 1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_1_5_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rp() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float 1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rm_ftz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rn_ftz() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rp_ftz() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_f2ui_rz_ftz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 1.5)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_1_5_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rm() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rn() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double 1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_1_5_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rp() {
+; CHECK-NEXT:    ret i32 2
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double 1.5)
+  ret i32 %res
+}
+
+define i32 @test_pos_1_5_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_pos_1_5_d2ui_rz() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double 1.5)
+  ret i32 %res
+}
+
+;###############################################################
+;#               Tests with Negative 1.5                       #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_f2i_rm() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rm() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rn() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rn() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float -1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_1_5_f2i_rp() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rp() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rz() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float -1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float -1.5)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_d2i_rm() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rm() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_d2i_rn() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rn() {
+; CHECK-NEXT:    ret i32 -2
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double -1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_1_5_d2i_rp() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rp() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_d2i_rz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2i_rz() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double -1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rm(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rn() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rn(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float -1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_1_5_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rp() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rp(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float -1.5)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rm_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rm.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rn_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rn.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rp_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rp.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rz_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rz.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float -1.5)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_1_5_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rm(double -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rn() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rn(double -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double -1.5)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_1_5_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rp() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rp(double -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double -1.5)
+  ret i32 %res
+}
+
+define i32 @test_neg_1_5_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rz(double -1.500000e+00)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double -1.5)
+  ret i32 %res
+}
+
+;###############################################################
+;#                    Tests with NaN                           #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_nan_f2i_rm() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rn() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+
+define i32 @test_nan_f2i_rp() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_nan_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_nan_d2i_rm() {
+; CHECK-LABEL: define i32 @test_nan_d2i_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_d2i_rn() {
+; CHECK-LABEL: define i32 @test_nan_d2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+
+define i32 @test_nan_d2i_rp() {
+; CHECK-LABEL: define i32 @test_nan_d2i_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_d2i_rz() {
+; CHECK-LABEL: define i32 @test_nan_d2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_nan_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+
+define i32 @test_nan_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_nan_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_nan_f2ui_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 0x7FFFFF0000000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_nan_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_nan_d2ui_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_nan_d2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+
+define i32 @test_nan_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_nan_d2ui_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+define i32 @test_nan_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_nan_d2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double 0xFFF8000000000000)
+  ret i32 %res
+}
+
+;###############################################################
+;#            Tests with Positive Subnormal                    #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_f2i_rm() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rn() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_subnormal_f2i_rp() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rp() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_d2i_rm() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_d2i_rn() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_subnormal_d2i_rp() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rp() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_d2i_rz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_subnormal_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rp() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_f2ui_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 0x380FFFFFC0000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_pos_subnormal_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rm() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+
+define i32 @test_pos_subnormal_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rp() {
+; CHECK-NEXT:    ret i32 1
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_pos_subnormal_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_pos_subnormal_d2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double 0x000fffffffffffff)
+  ret i32 %res
+}
+
+;###############################################################
+;#            Tests with Negative Subnormal                    #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_f2i_rm() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rm() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.f2i.rm(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rn() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_subnormal_f2i_rp() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2i_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_f2i_rm_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rm.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rn_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rn.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rp_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rp.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2i_rz_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2i_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2i.rz.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2i                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_d2i_rm() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rm() {
+; CHECK-NEXT:    ret i32 -1
+;
+  %res = call i32 @llvm.nvvm.d2i.rm(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_d2i_rn() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rn(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_subnormal_d2i_rp() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rp(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_d2i_rz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2i_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2i.rz(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_f2ui_rm() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rm(float 0xB80FFFFFC0000000)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rn() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_subnormal_f2ui_rp() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ui_ftz                                |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_f2ui_rm_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rm_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rn_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rn_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rp_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rp_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_f2ui_rz_ftz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_f2ui_rz_ftz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float 0xB80FFFFFC0000000)
+  ret i32 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ui                                  |
+;+-------------------------------------------------------------+
+define i32 @test_neg_subnormal_d2ui_rm() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rm(double 0x800FFFFFFFFFFFFF)
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @llvm.nvvm.d2ui.rm(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_d2ui_rn() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rn() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rn(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+
+define i32 @test_neg_subnormal_d2ui_rp() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rp() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rp(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+define i32 @test_neg_subnormal_d2ui_rz() {
+; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rz() {
+; CHECK-NEXT:    ret i32 0
+;
+  %res = call i32 @llvm.nvvm.d2ui.rz(double 0x800fffffffffffff)
+  ret i32 %res
+}
+
+declare i32 @llvm.nvvm.f2i.rm(float)
+declare i32 @llvm.nvvm.f2i.rn(float)
+declare i32 @llvm.nvvm.f2i.rp(float)
+declare i32 @llvm.nvvm.f2i.rz(float)
+
+declare i32 @llvm.nvvm.f2i.rm.ftz(float)
+declare i32 @llvm.nvvm.f2i.rn.ftz(float)
+declare i32 @llvm.nvvm.f2i.rp.ftz(float)
+declare i32 @llvm.nvvm.f2i.rz.ftz(float)
+
+declare i32 @llvm.nvvm.d2i.rm(double)
+declare i32 @llvm.nvvm.d2i.rn(double)
+declare i32 @llvm.nvvm.d2i.rp(double)
+declare i32 @llvm.nvvm.d2i.rz(double)
+
+
+declare i32 @llvm.nvvm.f2ui.rm(float)
+declare i32 @llvm.nvvm.f2ui.rn(float)
+declare i32 @llvm.nvvm.f2ui.rp(float)
+declare i32 @llvm.nvvm.f2ui.rz(float)
+
+declare i32 @llvm.nvvm.f2ui.rm.ftz(float)
+declare i32 @llvm.nvvm.f2ui.rn.ftz(float)
+declare i32 @llvm.nvvm.f2ui.rp.ftz(float)
+declare i32 @llvm.nvvm.f2ui.rz.ftz(float)
+
+declare i32 @llvm.nvvm.d2ui.rm(double)
+declare i32 @llvm.nvvm.d2ui.rn(double)
+declare i32 @llvm.nvvm.d2ui.rp(double)
+declare i32 @llvm.nvvm.d2ui.rz(double)
diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll
new file mode 100644
index 000000000000..be38177dce2c
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll
@@ -0,0 +1,1129 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instsimplify -march=nvptx64 -S | FileCheck %s
+
+; f2ll/f2ull and d2ll/d2ull - double/float to i64 tests
+
+;###############################################################
+;#               Tests with Positive 1.5                       #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rm() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rn() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float 1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_1_5_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rp() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float 1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 1.5)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rm() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rn() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double 1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_1_5_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rp() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ll_rz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double 1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rm() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rn() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float 1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_1_5_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rp() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float 1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rm_ftz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rn_ftz() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rp_ftz() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_f2ull_rz_ftz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 1.5)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_1_5_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rm() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rn() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double 1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_1_5_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rp() {
+; CHECK-NEXT:    ret i64 2
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double 1.5)
+  ret i64 %res
+}
+
+define i64 @test_pos_1_5_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_pos_1_5_d2ull_rz() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double 1.5)
+  ret i64 %res
+}
+
+;###############################################################
+;#               Tests with Negative 1.5                       #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rm() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rn() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float -1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_1_5_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rp() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rz() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float -1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float -1.5)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rm() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rn() {
+; CHECK-NEXT:    ret i64 -2
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double -1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_1_5_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rp() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ll_rz() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double -1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rm(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rn() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rn(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float -1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_1_5_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rp() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rp(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float -1.5)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rm_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rm.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rn_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rn.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rp_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rp.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rz_ftz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rz.ftz(float -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float -1.5)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_1_5_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rm(double -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rn() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rn(double -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double -1.5)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_1_5_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rp() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rp(double -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double -1.5)
+  ret i64 %res
+}
+
+define i64 @test_neg_1_5_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rz() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rz(double -1.500000e+00)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double -1.5)
+  ret i64 %res
+}
+
+;###############################################################
+;#                    Tests with NaN                           #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_nan_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+
+define i64 @test_nan_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_nan_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_nan_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_nan_d2ll_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_nan_d2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+
+define i64 @test_nan_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_nan_d2ll_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_nan_d2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_nan_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+
+define i64 @test_nan_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_nan_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_nan_f2ull_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 0x7FFFFF0000000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_nan_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_nan_d2ull_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_nan_d2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+
+define i64 @test_nan_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_nan_d2ull_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+define i64 @test_nan_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_nan_d2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double 0xFFF8000000000000)
+  ret i64 %res
+}
+
+;###############################################################
+;#            Tests with Positive Subnormal                    #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_subnormal_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rp() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_subnormal_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rp() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_subnormal_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rp() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_f2ull_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 0x380FFFFFC0000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_pos_subnormal_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rm() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+
+define i64 @test_pos_subnormal_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rp() {
+; CHECK-NEXT:    ret i64 1
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_pos_subnormal_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_pos_subnormal_d2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double 0x000fffffffffffff)
+  ret i64 %res
+}
+
+;###############################################################
+;#            Tests with Negative Subnormal                    #
+;###############################################################
+
+;+-------------------------------------------------------------+
+;|                        f2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_f2ll_rm() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rm() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rn() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_subnormal_f2ll_rp() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ll_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_f2ll_rm_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rn_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rp_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ll_rz_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ll_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ll                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_d2ll_rm() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rm() {
+; CHECK-NEXT:    ret i64 -1
+;
+  %res = call i64 @llvm.nvvm.d2ll.rm(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_d2ll_rn() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rn(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_subnormal_d2ll_rp() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rp(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_d2ll_rz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ll_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ll.rz(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                        f2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_f2ull_rm() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rm(float 0xB80FFFFFC0000000)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rn() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_subnormal_f2ull_rp() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+;+-------------------------------------------------------------+
+;|                      f2ull_ftz                                |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_f2ull_rm_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rm_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rn_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rn_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rp_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rp_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_f2ull_rz_ftz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rz_ftz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 0xB80FFFFFC0000000)
+  ret i64 %res
+}
+;+-------------------------------------------------------------+
+;|                        d2ull                                  |
+;+-------------------------------------------------------------+
+define i64 @test_neg_subnormal_d2ull_rm() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rm() {
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rm(double 0x800FFFFFFFFFFFFF)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+  %res = call i64 @llvm.nvvm.d2ull.rm(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_d2ull_rn() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rn() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rn(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+
+define i64 @test_neg_subnormal_d2ull_rp() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rp() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rp(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+define i64 @test_neg_subnormal_d2ull_rz() {
+; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rz() {
+; CHECK-NEXT:    ret i64 0
+;
+  %res = call i64 @llvm.nvvm.d2ull.rz(double 0x800fffffffffffff)
+  ret i64 %res
+}
+
+declare i64 @llvm.nvvm.f2ll.rm(float)
+declare i64 @llvm.nvvm.f2ll.rn(float)
+declare i64 @llvm.nvvm.f2ll.rp(float)
+declare i64 @llvm.nvvm.f2ll.rz(float)
+
+declare i64 @llvm.nvvm.f2ll.rm.ftz(float)
+declare i64 @llvm.nvvm.f2ll.rn.ftz(float)
+declare i64 @llvm.nvvm.f2ll.rp.ftz(float)
+declare i64 @llvm.nvvm.f2ll.rz.ftz(float)
+
+declare i64 @llvm.nvvm.d2ll.rm(double)
+declare i64 @llvm.nvvm.d2ll.rn(double)
+declare i64 @llvm.nvvm.d2ll.rp(double)
+declare i64 @llvm.nvvm.d2ll.rz(double)
+
+
+declare i64 @llvm.nvvm.f2ull.rm(float)
+declare i64 @llvm.nvvm.f2ull.rn(float)
+declare i64 @llvm.nvvm.f2ull.rp(float)
+declare i64 @llvm.nvvm.f2ull.rz(float)
+
+declare i64 @llvm.nvvm.f2ull.rm.ftz(float)
+declare i64 @llvm.nvvm.f2ull.rn.ftz(float)
+declare i64 @llvm.nvvm.f2ull.rp.ftz(float)
+declare i64 @llvm.nvvm.f2ull.rz.ftz(float)
+
+declare i64 @llvm.nvvm.d2ull.rm(double)
+declare i64 @llvm.nvvm.d2ull.rn(double)
+declare i64 @llvm.nvvm.d2ull.rp(double)
+declare i64 @llvm.nvvm.d2ull.rz(double)
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
new file mode 100644
index 000000000000..fe8a7e58a6a5
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
@@ -0,0 +1,180 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt %s -mtriple=x86_64-unknown-linux-gnu -passes=load-store-vectorizer -mcpu=skx -S -o - | FileCheck %s
+
+; This test verifies that the vectorizer can handle an extended sequence of
+; getelementptr instructions and generate longer vectors. With special handling,
+; some elements can still be vectorized even if they require looking up the
+; common underlying object deeper than 6 levels from the original pointer.
+
+; The test below is the simplified version of actual performance oriented
+; workload; the offsets in getelementptr instructions are similar or same for
+; the test simplicity.
+
+define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8(i32 %arg0, ptr align 16 %arg1) {
+; CHECK-LABEL: define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8(
+; CHECK-SAME: i32 [[ARG0:%.*]], ptr align 16 [[ARG1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[LEVEL1:%.*]] = getelementptr i8, ptr [[ARG1]], i32 917504
+; CHECK-NEXT:    [[LEVEL2:%.*]] = getelementptr i8, ptr [[LEVEL1]], i32 [[ARG0]]
+; CHECK-NEXT:    [[LEVEL3:%.*]] = getelementptr i8, ptr [[LEVEL2]], i32 32768
+; CHECK-NEXT:    [[LEVEL4:%.*]] = getelementptr i8, ptr [[LEVEL3]], i32 [[ARG0]]
+; CHECK-NEXT:    [[LEVEL5:%.*]] = getelementptr i8, ptr [[LEVEL4]], i32 [[ARG0]]
+; CHECK-NEXT:    [[A6:%.*]] = getelementptr i8, ptr [[LEVEL5]], i32 [[ARG0]]
+; CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[A6]], align 16
+; CHECK-NEXT:    ret void
+;
+
+  %level1 = getelementptr i8, ptr %arg1, i32 917504
+  %level2 = getelementptr i8, ptr %level1, i32 %arg0
+  %level3 = getelementptr i8, ptr %level2, i32 32768
+  %level4 = getelementptr i8, ptr %level3, i32 %arg0
+  %level5 = getelementptr i8, ptr %level4, i32 %arg0
+
+  %a6 = getelementptr i8, ptr %level5, i32 %arg0
+  %b7 = getelementptr i8, ptr %a6, i32 2
+  %c8 = getelementptr i8, ptr %b7, i32 8
+  %d8 = getelementptr i8, ptr %b7, i32 12
+
+  store half 0xH0000, ptr %a6, align 16
+  store <4 x half> zeroinitializer, ptr %b7, align 2
+  store <2 x half> zeroinitializer, ptr %c8, align 2
+  store half 0xH0000, ptr %d8, align 2
+  ret void
+}
+
+define void @v1x8_levels_6_7_8_9_10_11_12_13(i32 %arg0, ptr align 16 %arg1) {
+; CHECK-LABEL: define void @v1x8_levels_6_7_8_9_10_11_12_13(
+; CHECK-SAME: i32 [[ARG0:%.*]], ptr align 16 [[ARG1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LEVEL1:%.*]] = getelementptr i8, ptr [[ARG1]], i32 917504
+; CHECK-NEXT:    [[LEVEL2:%.*]] = getelementptr i8, ptr [[LEVEL1]], i32 [[ARG0]]
+; CHECK-NEXT:    [[LEVEL3:%.*]] = getelementptr i8, ptr [[LEVEL2]], i32 32768
+; CHECK-NEXT:    [[LEVEL4:%.*]] = getelementptr i8, ptr [[LEVEL3]], i32 [[ARG0]]
+; CHECK-NEXT:    [[LEVEL5:%.*]] = getelementptr i8, ptr [[LEVEL4]], i32 [[ARG0]]
+; CHECK-NEXT:    [[A6:%.*]] = getelementptr i8, ptr [[LEVEL5]], i32 [[ARG0]]
+; CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[A6]], align 16
+; CHECK-NEXT:    ret void
+;
+
+  %level1 = getelementptr i8, ptr %arg1, i32 917504
+  %level2 = getelementptr i8, ptr %level1, i32 %arg0
+  %level3 = getelementptr i8, ptr %level2, i32 32768
+  %level4 = getelementptr i8, ptr %level3, i32 %arg0
+  %level5 = getelementptr i8, ptr %level4, i32 %arg0
+
+  %a6 = getelementptr i8, ptr %level5, i32 %arg0
+  %b7 = getelementptr i8, ptr %a6, i32 2
+  %c8 = getelementptr i8, ptr %b7, i32 2
+  %d9 = getelementptr i8, ptr %c8, i32 2
+  %e10 = getelementptr i8, ptr %d9, i32 2
+  %f11 = getelementptr i8, ptr %e10, i32 2
+  %g12 = getelementptr i8, ptr %f11, i32 2
+  %h13 = getelementptr i8, ptr %g12, i32 2
+
+  store half 0xH0000, ptr %a6, align 16
+  store half 0xH0000, ptr %b7, align 2
+  store half 0xH0000, ptr %c8, align 2
+  store half 0xH0000, ptr %d9, align 2
+  store half 0xH0000, ptr %e10, align 8
+  store half 0xH0000, ptr %f11, align 2
+  store half 0xH0000, ptr %g12, align 2
+  store half 0xH0000, ptr %h13, align 2
+  ret void
+}
+
+define void @v1_4_4_4_2_1_to_v8_8_levels_6_7(i32 %arg0, ptr addrspace(3) align 16 %arg1_ptr, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, half %arg6_half, half %arg7_half, <2 x half> %arg8_2xhalf) {
+; CHECK-LABEL: define void @v1_4_4_4_2_1_to_v8_8_levels_6_7(
+; CHECK-SAME: i32 [[ARG0:%.*]], ptr addrspace(3) align 16 [[ARG1_PTR:%.*]], i32 [[ARG2:%.*]], i32 [[ARG3:%.*]], i32 [[ARG4:%.*]], i32 [[ARG5:%.*]], half [[ARG6_HALF:%.*]], half [[ARG7_HALF:%.*]], <2 x half> [[ARG8_2XHALF:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[ARG1_PTR]], i32 458752
+; CHECK-NEXT:    br [[DOTPREHEADER11_PREHEADER:label %.*]]
+; CHECK:       [[_PREHEADER11_PREHEADER:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i32 [[ARG0]], 6
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[ARG2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[ARG3]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[ARG0]], 2
+; CHECK-NEXT:    br i1 [[CMP]], [[DOTLR_PH:label %.*]], [[DOTEXIT_POINT:label %.*]]
+; CHECK:       [[_LR_PH:.*:]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP5]], i32 [[ARG4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[GEP]], i32 [[ARG5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> poison, half [[ARG6_HALF]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half 0xH0000, i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x half> [[TMP8]], half 0xH0000, i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x half> [[TMP9]], half 0xH0000, i32 3
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x half> [[TMP10]], half 0xH0000, i32 4
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x half> [[ARG8_2XHALF]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x half> [[TMP11]], half [[TMP12]], i32 5
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x half> [[ARG8_2XHALF]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x half> [[TMP13]], half [[TMP14]], i32 6
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x half> [[TMP15]], half [[ARG7_HALF]], i32 7
+; CHECK-NEXT:    store <8 x half> [[TMP16]], ptr addrspace(3) [[TMP6]], align 2
+; CHECK-NEXT:    br [[DOTEXIT_POINT]]
+; CHECK:       [[_EXIT_POINT:.*:]]
+; CHECK-NEXT:    ret void
+;
+  %base1 = getelementptr inbounds i8, ptr addrspace(3) %arg1_ptr, i32 458752
+  br label %.preheader11.preheader
+
+.preheader11.preheader:
+  %base2 = shl nuw nsw i32 %arg0, 6
+  %base3 = getelementptr inbounds i8, ptr addrspace(3) %base1, i32 %base2
+
+  %base4 = getelementptr inbounds i8, ptr addrspace(3) %base3, i32 %arg2
+  %base5 = getelementptr inbounds i8, ptr addrspace(3) %base4, i32 %arg3
+
+  %cmp = icmp sgt i32 %arg0, 2
+  br i1 %cmp, label %.lr.ph, label %.exit_point
+
+.lr.ph:
+  %gep = getelementptr inbounds i8, ptr addrspace(3) %base5, i32 %arg4
+
+  %dst = getelementptr inbounds i8, ptr addrspace(3) %gep, i32 %arg5
+  %dst_off2 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 2
+  %dst_off10 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 10
+  %dst_off14 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 14
+
+  store half %arg6_half, ptr addrspace(3) %dst, align 2
+  store <4 x half> zeroinitializer, ptr addrspace(3) %dst_off2, align 2
+  store <2 x half> %arg8_2xhalf, ptr addrspace(3) %dst_off10, align 2
+  store half %arg7_half, ptr addrspace(3) %dst_off14, align 2
+  br label %.exit_point
+
+.exit_point:
+  ret void
+}
+
+; The regression test for merging equivalence classes. It is reduced and adapted
+; for LSV from llvm/test/CodeGen/NVPTX/variadics-backend.ll, which failed at
+; post-commit checks with memory sanitizer on the initial attempt to implement
+; the merging of the equivalence classes.
+define void @variadics1(ptr %vlist) {
+; CHECK-LABEL: define void @variadics1(
+; CHECK-SAME: ptr [[VLIST:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[ARGP_CUR7_ALIGNED2:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[VLIST]], i64 0)
+; CHECK-NEXT:    [[ARGP_NEXT8:%.*]] = getelementptr i8, ptr [[ARGP_CUR7_ALIGNED2]], i64 8
+; CHECK-NEXT:    [[X0:%.*]] = getelementptr i8, ptr [[ARGP_NEXT8]], i32 7
+; CHECK-NEXT:    [[ARGP_CUR11_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[X0]], i64 0)
+; CHECK-NEXT:    [[ARGP_NEXT12:%.*]] = getelementptr i8, ptr [[ARGP_CUR11_ALIGNED]], i64 8
+; CHECK-NEXT:    [[X2:%.*]] = getelementptr i8, ptr [[ARGP_NEXT12]], i32 7
+; CHECK-NEXT:    [[ARGP_CUR16_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[X2]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARGP_CUR16_ALIGNED]], align 4294967296
+; CHECK-NEXT:    [[X31:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; CHECK-NEXT:    [[X42:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    [[X5:%.*]] = fadd double [[X42]], [[X31]]
+; CHECK-NEXT:    store double [[X5]], ptr null, align 8
+; CHECK-NEXT:    ret void
+;
+  %argp.cur7.aligned2 = call ptr @llvm.ptrmask.p0.i64(ptr %vlist, i64 0)
+  %argp.next8 = getelementptr i8, ptr %argp.cur7.aligned2, i64 8
+  %x0 = getelementptr i8, ptr %argp.next8, i32 7
+  %argp.cur11.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %x0, i64 0)
+  %argp.next12 = getelementptr i8, ptr %argp.cur11.aligned, i64 8
+  %x2 = getelementptr i8, ptr %argp.next12, i32 7
+  %argp.cur16.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %x2, i64 0)
+  %x3 = load double, ptr %argp.cur16.aligned, align 8
+  %argp.cur16.aligned_off8 = getelementptr i8, ptr %argp.cur16.aligned, i32 8
+  %x4 = load double, ptr %argp.cur16.aligned_off8, align 8
+  %x5 = fadd double %x4, %x3
+  store double %x5, ptr null, align 8
+  ret void
+}
+
+declare ptr @llvm.ptrmask.p0.i64(ptr, i64)
diff --git a/llvm/test/Transforms/LoopStrengthReduce/NVPTX/trunc.ll b/llvm/test/Transforms/LoopStrengthReduce/NVPTX/trunc.ll
index 8761122f756f..e6b5991d8dfb 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/NVPTX/trunc.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/NVPTX/trunc.ll
@@ -13,7 +13,7 @@ target triple = "nvptx64-nvidia-cuda"
 ; That would be worthless, because "i" is simulated by two 32-bit registers and
 ; truncating it to 32-bit is as simple as directly using the register that
 ; contains the low bits.
-define void @trunc_is_free(i64 %begin, i64 %stride, i64 %end) {
+define ptx_kernel void @trunc_is_free(i64 %begin, i64 %stride, i64 %end) {
 ; CHECK-LABEL: @trunc_is_free(
 entry:
   %cmp.4 = icmp eq i64 %begin, %end
@@ -41,5 +41,3 @@ for.body:                                         ; preds = %for.body.preheader,
 
 declare void @_Z3usei(i32)
 
-!nvvm.annotations = !{!0}
-!0 = !{ptr @trunc_is_free, !"kernel", i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
index ddf6c1005e05..254cdf2d14d9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
@@ -209,6 +209,7 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i1> poison, i1 [[C_0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT]], <16 x i1> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE30:.*]] ]
@@ -218,7 +219,6 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor <16 x i1> [[TMP3]], splat (i1 true)
-; CHECK-NEXT:    [[TMP5:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <16 x i1> [[TMP4]], <16 x i1> [[TMP5]], <16 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = or <16 x i1> [[TMP6]], [[TMP3]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> splat (i8 1)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
index 08a600143190..8c5d84e6981b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
@@ -151,9 +151,9 @@ exit:
   ret void
 }
 
-define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4, ptr %src, ptr %dst.3, i1 %c.3, ptr %dst.2) {
+define void @test_exit_branch_cost(ptr %dst, ptr noalias %x.ptr, ptr noalias %y.ptr, ptr %dst.1, i1 %c.4, ptr %src, ptr %dst.3, i1 %c.3, ptr %dst.2) {
 ; CHECK-LABEL: define void @test_exit_branch_cost(
-; CHECK-SAME: ptr [[DST:%.*]], i64 [[X:%.*]], i32 [[Y:%.*]], ptr [[DST_1:%.*]], i1 [[C_4:%.*]], ptr [[SRC:%.*]], ptr [[DST_3:%.*]], i1 [[C_3:%.*]], ptr [[DST_2:%.*]]) {
+; CHECK-SAME: ptr [[DST:%.*]], ptr noalias [[X_PTR:%.*]], ptr noalias [[Y_PTR:%.*]], ptr [[DST_1:%.*]], i1 [[C_4:%.*]], ptr [[SRC:%.*]], ptr [[DST_3:%.*]], i1 [[C_3:%.*]], ptr [[DST_2:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
@@ -172,11 +172,11 @@ define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4
 ; CHECK-NEXT:    [[BOUND08:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP3]]
 ; CHECK-NEXT:    [[BOUND19:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]]
-; CHECK-NEXT:    [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]]
+; CHECK-NEXT:    [[CONFLICT_RDX21:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]]
 ; CHECK-NEXT:    [[BOUND012:%.*]] = icmp ult ptr [[DST_1]], [[SCEVGEP4]]
 ; CHECK-NEXT:    [[BOUND113:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
-; CHECK-NEXT:    [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX11]], [[FOUND_CONFLICT14]]
+; CHECK-NEXT:    [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX21]], [[FOUND_CONFLICT14]]
 ; CHECK-NEXT:    [[BOUND016:%.*]] = icmp ult ptr [[DST_3]], [[SCEVGEP2]]
 ; CHECK-NEXT:    [[BOUND117:%.*]] = icmp ult ptr [[DST_2]], [[SCEVGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
@@ -184,161 +184,101 @@ define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4
 ; CHECK-NEXT:    [[BOUND020:%.*]] = icmp ult ptr [[DST_3]], [[SCEVGEP3]]
 ; CHECK-NEXT:    [[BOUND121:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT22:%.*]] = and i1 [[BOUND020]], [[BOUND121]]
-; CHECK-NEXT:    [[CONFLICT_RDX23:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]]
+; CHECK-NEXT:    [[CONFLICT_RDX41:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]]
 ; CHECK-NEXT:    [[BOUND024:%.*]] = icmp ult ptr [[DST_3]], [[SCEVGEP4]]
 ; CHECK-NEXT:    [[BOUND125:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT26:%.*]] = and i1 [[BOUND024]], [[BOUND125]]
-; CHECK-NEXT:    [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX23]], [[FOUND_CONFLICT26]]
+; CHECK-NEXT:    [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX41]], [[FOUND_CONFLICT26]]
 ; CHECK-NEXT:    [[BOUND028:%.*]] = icmp ult ptr [[DST_2]], [[SCEVGEP3]]
 ; CHECK-NEXT:    [[BOUND129:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT30:%.*]] = and i1 [[BOUND028]], [[BOUND129]]
-; CHECK-NEXT:    [[CONFLICT_RDX31:%.*]] = or i1 [[CONFLICT_RDX27]], [[FOUND_CONFLICT30]]
+; CHECK-NEXT:    [[CONFLICT_RDX65:%.*]] = or i1 [[CONFLICT_RDX27]], [[FOUND_CONFLICT30]]
 ; CHECK-NEXT:    [[BOUND032:%.*]] = icmp ult ptr [[DST_2]], [[SCEVGEP4]]
 ; CHECK-NEXT:    [[BOUND133:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[FOUND_CONFLICT34:%.*]] = and i1 [[BOUND032]], [[BOUND133]]
-; CHECK-NEXT:    [[CONFLICT_RDX35:%.*]] = or i1 [[CONFLICT_RDX31]], [[FOUND_CONFLICT34]]
+; CHECK-NEXT:    [[FOUND_CONFLICT68:%.*]] = and i1 [[BOUND032]], [[BOUND133]]
+; CHECK-NEXT:    [[CONFLICT_RDX35:%.*]] = or i1 [[CONFLICT_RDX65]], [[FOUND_CONFLICT68]]
 ; CHECK-NEXT:    [[BOUND036:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP4]]
 ; CHECK-NEXT:    [[BOUND137:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP3]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT38:%.*]] = and i1 [[BOUND036]], [[BOUND137]]
 ; CHECK-NEXT:    [[CONFLICT_RDX39:%.*]] = or i1 [[CONFLICT_RDX35]], [[FOUND_CONFLICT38]]
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX39]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[X]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT40:%.*]] = insertelement <2 x i1> poison, i1 [[C_3]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT41:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT40]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[C_4]], <2 x i1> [[BROADCAST_SPLAT41]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT56:%.*]] = insertelement <2 x i1> poison, i1 [[C_4]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT57:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT56]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP33:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT57]], splat (i1 true)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE74:.*]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i1> [[TMP47]], splat (i1 true)
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE55:.*]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[X_PTR]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq <2 x i64> [[WIDE_LOAD]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i1> [[TMP47]], splat (i1 true)
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; CHECK:       [[PRED_STORE_IF]]:
 ; CHECK-NEXT:    store i64 0, ptr [[DST_1]], align 8, !alias.scope [[META7:![0-9]+]], !noalias [[META10:![0-9]+]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
 ; CHECK:       [[PRED_STORE_CONTINUE]]:
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; CHECK-NEXT:    br i1 [[TMP7]], label %[[PRED_STORE_IF42:.*]], label %[[PRED_STORE_CONTINUE43:.*]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF42:.*]], label %[[PRED_STORE_CONTINUE43:.*]]
 ; CHECK:       [[PRED_STORE_IF42]]:
 ; CHECK-NEXT:    store i64 0, ptr [[DST_1]], align 8, !alias.scope [[META7]], !noalias [[META10]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE43]]
 ; CHECK:       [[PRED_STORE_CONTINUE43]]:
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF44:.*]], label %[[PRED_STORE_CONTINUE45:.*]]
+; CHECK-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[TMP5]], <2 x i1> [[TMP11]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP13]], i32 0
+; CHECK-NEXT:    br i1 [[TMP16]], label %[[PRED_STORE_IF44:.*]], label %[[PRED_STORE_CONTINUE45:.*]]
 ; CHECK:       [[PRED_STORE_IF44]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_1]], align 8, !alias.scope [[META7]], !noalias [[META10]]
+; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15:![0-9]+]], !noalias [[META16:![0-9]+]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE45]]
 ; CHECK:       [[PRED_STORE_CONTINUE45]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
-; CHECK-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF46:.*]], label %[[PRED_STORE_CONTINUE47:.*]]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF46:.*]], label %[[PRED_STORE_CONTINUE47:.*]]
 ; CHECK:       [[PRED_STORE_IF46]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_1]], align 8, !alias.scope [[META7]], !noalias [[META10]]
+; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15]], !noalias [[META16]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE47]]
 ; CHECK:       [[PRED_STORE_CONTINUE47]]:
-; CHECK-NEXT:    [[TMP10:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true)
-; CHECK-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true)
-; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[TMP5]], <2 x i1> [[TMP11]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0
-; CHECK-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF48:.*]], label %[[PRED_STORE_CONTINUE49:.*]]
-; CHECK:       [[PRED_STORE_IF48]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15:![0-9]+]], !noalias [[META16:![0-9]+]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE49]]
-; CHECK:       [[PRED_STORE_CONTINUE49]]:
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1
-; CHECK-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF50:.*]], label %[[PRED_STORE_CONTINUE51:.*]]
-; CHECK:       [[PRED_STORE_IF50]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15]], !noalias [[META16]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE51]]
-; CHECK:       [[PRED_STORE_CONTINUE51]]:
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i1> [[TMP13]], i32 0
-; CHECK-NEXT:    br i1 [[TMP16]], label %[[PRED_STORE_IF52:.*]], label %[[PRED_STORE_CONTINUE53:.*]]
-; CHECK:       [[PRED_STORE_IF52]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15]], !noalias [[META16]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE53]]
-; CHECK:       [[PRED_STORE_CONTINUE53]]:
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1
-; CHECK-NEXT:    br i1 [[TMP17]], label %[[PRED_STORE_IF54:.*]], label %[[PRED_STORE_CONTINUE55:.*]]
-; CHECK:       [[PRED_STORE_IF54]]:
-; CHECK-NEXT:    store i64 0, ptr [[DST_3]], align 8, !alias.scope [[META15]], !noalias [[META16]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE55]]
-; CHECK:       [[PRED_STORE_CONTINUE55]]:
-; CHECK-NEXT:    [[TMP18:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[BROADCAST_SPLAT41]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP19:%.*]] = select <2 x i1> [[TMP5]], <2 x i1> [[BROADCAST_SPLAT41]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP20:%.*]] = select <2 x i1> [[TMP18]], <2 x i1> [[BROADCAST_SPLAT57]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP21:%.*]] = select <2 x i1> [[TMP19]], <2 x i1> [[BROADCAST_SPLAT57]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP22:%.*]] = or <2 x i1> [[TMP47]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = or <2 x i1> [[TMP47]], [[TMP21]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP20]], <2 x i64> zeroinitializer, <2 x i64> splat (i64 1)
 ; CHECK-NEXT:    [[PREDPHI58:%.*]] = select <2 x i1> [[TMP21]], <2 x i64> zeroinitializer, <2 x i64> splat (i64 1)
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x i1> [[TMP22]], i32 0
-; CHECK-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF59:.*]], label %[[PRED_STORE_CONTINUE60:.*]]
-; CHECK:       [[PRED_STORE_IF59]]:
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <2 x i64> [[PREDPHI]], i32 0
-; CHECK-NEXT:    store i64 [[TMP25]], ptr [[DST_2]], align 8, !alias.scope [[META17:![0-9]+]], !noalias [[META18:![0-9]+]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE60]]
-; CHECK:       [[PRED_STORE_CONTINUE60]]:
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <2 x i1> [[TMP22]], i32 1
-; CHECK-NEXT:    br i1 [[TMP26]], label %[[PRED_STORE_IF61:.*]], label %[[PRED_STORE_CONTINUE62:.*]]
-; CHECK:       [[PRED_STORE_IF61]]:
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <2 x i64> [[PREDPHI]], i32 1
-; CHECK-NEXT:    store i64 [[TMP27]], ptr [[DST_2]], align 8, !alias.scope [[META17]], !noalias [[META18]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE62]]
-; CHECK:       [[PRED_STORE_CONTINUE62]]:
 ; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <2 x i1> [[TMP23]], i32 0
-; CHECK-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF63:.*]], label %[[PRED_STORE_CONTINUE64:.*]]
-; CHECK:       [[PRED_STORE_IF63]]:
+; CHECK-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF48:.*]], label %[[PRED_STORE_CONTINUE49:.*]]
+; CHECK:       [[PRED_STORE_IF48]]:
 ; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <2 x i64> [[PREDPHI58]], i32 0
-; CHECK-NEXT:    store i64 [[TMP29]], ptr [[DST_2]], align 8, !alias.scope [[META17]], !noalias [[META18]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE64]]
-; CHECK:       [[PRED_STORE_CONTINUE64]]:
+; CHECK-NEXT:    store i64 [[TMP29]], ptr [[DST_2]], align 8, !alias.scope [[META17:![0-9]+]], !noalias [[META18:![0-9]+]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE49]]
+; CHECK:       [[PRED_STORE_CONTINUE49]]:
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <2 x i1> [[TMP23]], i32 1
-; CHECK-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF65:.*]], label %[[PRED_STORE_CONTINUE66:.*]]
-; CHECK:       [[PRED_STORE_IF65]]:
+; CHECK-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF50:.*]], label %[[PRED_STORE_CONTINUE51:.*]]
+; CHECK:       [[PRED_STORE_IF50]]:
 ; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <2 x i64> [[PREDPHI58]], i32 1
 ; CHECK-NEXT:    store i64 [[TMP31]], ptr [[DST_2]], align 8, !alias.scope [[META17]], !noalias [[META18]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE66]]
-; CHECK:       [[PRED_STORE_CONTINUE66]]:
-; CHECK-NEXT:    [[TMP32:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT57]], splat (i1 true)
-; CHECK-NEXT:    [[TMP33:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT57]], splat (i1 true)
-; CHECK-NEXT:    [[TMP34:%.*]] = select <2 x i1> [[TMP18]], <2 x i1> [[TMP32]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE51]]
+; CHECK:       [[PRED_STORE_CONTINUE51]]:
 ; CHECK-NEXT:    [[TMP35:%.*]] = select <2 x i1> [[TMP19]], <2 x i1> [[TMP33]], <2 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP36:%.*]] = or <2 x i1> [[TMP22]], [[TMP34]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = or <2 x i1> [[TMP23]], [[TMP35]]
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <2 x i1> [[TMP36]], i32 0
-; CHECK-NEXT:    br i1 [[TMP38]], label %[[PRED_STORE_IF67:.*]], label %[[PRED_STORE_CONTINUE68:.*]]
-; CHECK:       [[PRED_STORE_IF67]]:
-; CHECK-NEXT:    [[TMP45:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19:![0-9]+]]
-; CHECK-NEXT:    store i64 [[TMP45]], ptr [[DST]], align 8, !alias.scope [[META20:![0-9]+]], !noalias [[META19]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE68]]
-; CHECK:       [[PRED_STORE_CONTINUE68]]:
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <2 x i1> [[TMP36]], i32 1
-; CHECK-NEXT:    br i1 [[TMP40]], label %[[PRED_STORE_IF69:.*]], label %[[PRED_STORE_CONTINUE70:.*]]
-; CHECK:       [[PRED_STORE_IF69]]:
-; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19]]
-; CHECK-NEXT:    store i64 [[TMP39]], ptr [[DST]], align 8, !alias.scope [[META20]], !noalias [[META19]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE70]]
-; CHECK:       [[PRED_STORE_CONTINUE70]]:
 ; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i1> [[TMP37]], i32 0
-; CHECK-NEXT:    br i1 [[TMP42]], label %[[PRED_STORE_IF71:.*]], label %[[PRED_STORE_CONTINUE72:.*]]
-; CHECK:       [[PRED_STORE_IF71]]:
-; CHECK-NEXT:    [[TMP41:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19]]
-; CHECK-NEXT:    store i64 [[TMP41]], ptr [[DST]], align 8, !alias.scope [[META20]], !noalias [[META19]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE72]]
-; CHECK:       [[PRED_STORE_CONTINUE72]]:
+; CHECK-NEXT:    br i1 [[TMP42]], label %[[PRED_STORE_IF52:.*]], label %[[PRED_STORE_CONTINUE53:.*]]
+; CHECK:       [[PRED_STORE_IF52]]:
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19:![0-9]+]]
+; CHECK-NEXT:    store i64 [[TMP24]], ptr [[DST]], align 8, !alias.scope [[META20:![0-9]+]], !noalias [[META19]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE53]]
+; CHECK:       [[PRED_STORE_CONTINUE53]]:
 ; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <2 x i1> [[TMP37]], i32 1
-; CHECK-NEXT:    br i1 [[TMP44]], label %[[PRED_STORE_IF73:.*]], label %[[PRED_STORE_CONTINUE74]]
-; CHECK:       [[PRED_STORE_IF73]]:
-; CHECK-NEXT:    [[TMP43:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19]]
-; CHECK-NEXT:    store i64 [[TMP43]], ptr [[DST]], align 8, !alias.scope [[META20]], !noalias [[META19]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE74]]
-; CHECK:       [[PRED_STORE_CONTINUE74]]:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    br i1 [[TMP44]], label %[[PRED_STORE_IF54:.*]], label %[[PRED_STORE_CONTINUE55]]
+; CHECK:       [[PRED_STORE_IF54]]:
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META19]]
+; CHECK-NEXT:    store i64 [[TMP25]], ptr [[DST]], align 8, !alias.scope [[META20]], !noalias [[META19]]
+; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE55]]
+; CHECK:       [[PRED_STORE_CONTINUE55]]:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
 ; CHECK-NEXT:    br i1 [[TMP46]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
@@ -348,6 +288,10 @@ define void @test_exit_branch_cost(ptr %dst, i64 %x, i32 %y, ptr %dst.1, i1 %c.4
 ; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
 ; CHECK:       [[LOOP_HEADER]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[X_GEP:%.*]] = getelementptr i64, ptr [[X_PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[X:%.*]] = load i64, ptr [[X_GEP]], align 8
+; CHECK-NEXT:    [[Y_GEP:%.*]] = getelementptr i32, ptr [[Y_PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[Y:%.*]] = load i32, ptr [[Y_GEP]], align 4
 ; CHECK-NEXT:    [[C1:%.*]] = icmp eq i64 [[X]], 0
 ; CHECK-NEXT:    br i1 [[C1]], label %[[THEN_4:.*]], label %[[THEN_1:.*]]
 ; CHECK:       [[THEN_1]]:
@@ -386,6 +330,10 @@ entry:
 
 loop.header:
   %iv = phi i64 [ %iv.next, %loop.latch ], [ 0, %entry ]
+  %x.gep = getelementptr i64, ptr %x.ptr, i64 %iv
+  %x = load i64, ptr %x.gep
+  %y.gep = getelementptr i32, ptr %y.ptr, i64 %iv
+  %y = load i32, ptr %y.gep
   %c1 = icmp eq i64 %x, 0
   br i1 %c1, label %then.4, label %then.1
 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
index f9c1ab4a8181..3d00c228baf5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll
@@ -75,17 +75,17 @@ define i32 @any_of_reduction_used_in_blend_with_mutliple_phis(ptr %src, i64 %N,
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[C_0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <vscale x 2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[C_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i1> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <vscale x 2 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
+; CHECK-NEXT:    [[TMP8:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[SRC]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PREDPHI:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = xor <vscale x 2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <vscale x 2 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
-; CHECK-NEXT:    [[TMP8:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x ptr> @llvm.masked.gather.nxv2p0.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT4]], i32 8, <vscale x 2 x i1> [[TMP8]], <vscale x 2 x ptr> poison)
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <vscale x 2 x ptr> [[WIDE_MASKED_GATHER]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = or <vscale x 2 x i1> [[VEC_PHI]], [[TMP9]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index a7765f47180d..038e726adc24 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -432,6 +432,7 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <vscale x 2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -439,7 +440,6 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
 ; CHECK-NEXT:    [[TMP11:%.*]] = udiv <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP10]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
 ; CHECK-NEXT:    store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP9]], align 8
@@ -477,6 +477,7 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0
 ; FIXED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; FIXED-NEXT:    [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; FIXED-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
 ; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -486,10 +487,8 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
 ; FIXED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
 ; FIXED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
-; FIXED-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
-; FIXED-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT:    [[TMP7:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[TMP5]]
-; FIXED-NEXT:    [[TMP8:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP6]]
+; FIXED-NEXT:    [[TMP8:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]]
 ; FIXED-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD]]
 ; FIXED-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
 ; FIXED-NEXT:    store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 8
@@ -560,6 +559,7 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <vscale x 2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -567,7 +567,6 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> splat (i64 1)
 ; CHECK-NEXT:    [[TMP11:%.*]] = sdiv <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP10]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
 ; CHECK-NEXT:    store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP9]], align 8
@@ -605,6 +604,7 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0
 ; FIXED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; FIXED-NEXT:    [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; FIXED-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
 ; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -614,10 +614,8 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
 ; FIXED-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
 ; FIXED-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
-; FIXED-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
-; FIXED-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1)
 ; FIXED-NEXT:    [[TMP7:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[TMP5]]
-; FIXED-NEXT:    [[TMP8:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP6]]
+; FIXED-NEXT:    [[TMP8:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]]
 ; FIXED-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD]]
 ; FIXED-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
 ; FIXED-NEXT:    store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index a330b6964a66..f323231445aa 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -37,16 +37,16 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 0, [[TMP18]]
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = sub i64 1, [[TMP18]]
-; IF-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP9]]
-; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[TMP10]]
+; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = mul i64 0, [[TMP19]]
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = sub i64 1, [[TMP19]]
-; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP14]]
-; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP15]]
+; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
+; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
 ; IF-EVL-NEXT:    [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll
index 66bb9357750c..3d23090dd123 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll
@@ -4,8 +4,6 @@
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-; FIXME:  GEP flags on GEPs for reverse vector pointer need to be dropped when folding the tail.
-
 define i1 @fn(ptr %nno) #0 {
 ; CHECK-LABEL: define i1 @fn(
 ; CHECK-SAME: ptr [[NNO:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -26,8 +24,8 @@ define i1 @fn(ptr %nno) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw i32, ptr [[NNO]], i64 [[TMP22]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 -3
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP23]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 -3
 ; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[REVERSE]], <4 x i32> poison)
 ; CHECK-NEXT:    [[REVERSE1:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll b/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
index 270e6bcd9ab1..1a9e7ddb965f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
@@ -14,72 +14,70 @@ define i32 @unused_blend_after_unrolling(ptr %p, i32 %a, i1 %c.1, i16 %x, i16 %y
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C_1]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT17:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT16]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT17]], splat (i1 true)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_SDIV_CONTINUE15:.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[PRED_SDIV_CONTINUE15]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[PRED_SDIV_CONTINUE15]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_SDIV_CONTINUE17:.*]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP24:%.*]], %[[PRED_SDIV_CONTINUE17]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[PRED_SDIV_CONTINUE17]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP2]], label %[[PRED_SDIV_IF:.*]], label %[[PRED_SDIV_CONTINUE:.*]]
 ; CHECK:       [[PRED_SDIV_IF]]:
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE]]
 ; CHECK:       [[PRED_SDIV_CONTINUE]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
-; CHECK-NEXT:    br i1 [[TMP3]], label %[[PRED_SDIV_IF2:.*]], label %[[PRED_SDIV_CONTINUE3:.*]]
-; CHECK:       [[PRED_SDIV_IF2]]:
-; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE3]]
-; CHECK:       [[PRED_SDIV_CONTINUE3]]:
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[PRED_SDIV_IF4:.*]], label %[[PRED_SDIV_CONTINUE5:.*]]
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[PRED_SDIV_IF4:.*]], label %[[PRED_SDIV_CONTINUE5:.*]]
 ; CHECK:       [[PRED_SDIV_IF4]]:
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE5]]
 ; CHECK:       [[PRED_SDIV_CONTINUE5]]:
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
-; CHECK-NEXT:    br i1 [[TMP5]], label %[[PRED_SDIV_IF6:.*]], label %[[PRED_SDIV_CONTINUE7:.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[PRED_SDIV_IF6:.*]], label %[[PRED_SDIV_CONTINUE7:.*]]
 ; CHECK:       [[PRED_SDIV_IF6]]:
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE7]]
 ; CHECK:       [[PRED_SDIV_CONTINUE7]]:
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
-; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_SDIV_IF8:.*]], label %[[PRED_SDIV_CONTINUE9:.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[PRED_SDIV_IF8:.*]], label %[[PRED_SDIV_CONTINUE9:.*]]
 ; CHECK:       [[PRED_SDIV_IF8]]:
-; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i16> poison, i16 [[TMP7]], i32 0
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE9]]
 ; CHECK:       [[PRED_SDIV_CONTINUE9]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x i16> [ poison, %[[PRED_SDIV_CONTINUE7]] ], [ [[TMP8]], %[[PRED_SDIV_IF8]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
-; CHECK-NEXT:    br i1 [[TMP10]], label %[[PRED_SDIV_IF10:.*]], label %[[PRED_SDIV_CONTINUE11:.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[PRED_SDIV_IF10:.*]], label %[[PRED_SDIV_CONTINUE11:.*]]
 ; CHECK:       [[PRED_SDIV_IF10]]:
-; CHECK-NEXT:    [[TMP11:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i16> [[TMP9]], i16 [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i16 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i16> poison, i16 [[TMP7]], i32 0
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE11]]
 ; CHECK:       [[PRED_SDIV_CONTINUE11]]:
-; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x i16> [ [[TMP9]], %[[PRED_SDIV_CONTINUE9]] ], [ [[TMP12]], %[[PRED_SDIV_IF10]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
-; CHECK-NEXT:    br i1 [[TMP14]], label %[[PRED_SDIV_IF12:.*]], label %[[PRED_SDIV_CONTINUE13:.*]]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x i16> [ poison, %[[PRED_SDIV_CONTINUE9]] ], [ [[TMP8]], %[[PRED_SDIV_IF10]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[PRED_SDIV_IF12:.*]], label %[[PRED_SDIV_CONTINUE13:.*]]
 ; CHECK:       [[PRED_SDIV_IF12]]:
-; CHECK-NEXT:    [[TMP15:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i16> [[TMP13]], i16 [[TMP15]], i32 2
+; CHECK-NEXT:    [[TMP11:%.*]] = sdiv i16 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i16> [[TMP9]], i16 [[TMP11]], i32 1
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE13]]
 ; CHECK:       [[PRED_SDIV_CONTINUE13]]:
-; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x i16> [ [[TMP13]], %[[PRED_SDIV_CONTINUE11]] ], [ [[TMP16]], %[[PRED_SDIV_IF12]] ]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[PRED_SDIV_IF14:.*]], label %[[PRED_SDIV_CONTINUE15]]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x i16> [ [[TMP9]], %[[PRED_SDIV_CONTINUE11]] ], [ [[TMP12]], %[[PRED_SDIV_IF12]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[PRED_SDIV_IF14:.*]], label %[[PRED_SDIV_CONTINUE15:.*]]
 ; CHECK:       [[PRED_SDIV_IF14]]:
-; CHECK-NEXT:    [[TMP19:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP19]], i32 3
+; CHECK-NEXT:    [[TMP15:%.*]] = sdiv i16 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i16> [[TMP13]], i16 [[TMP15]], i32 2
 ; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE15]]
 ; CHECK:       [[PRED_SDIV_CONTINUE15]]:
-; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x i16> [ [[TMP17]], %[[PRED_SDIV_CONTINUE13]] ], [ [[TMP20]], %[[PRED_SDIV_IF14]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x i16> [ [[TMP13]], %[[PRED_SDIV_CONTINUE13]] ], [ [[TMP16]], %[[PRED_SDIV_IF14]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[PRED_SDIV_IF16:.*]], label %[[PRED_SDIV_CONTINUE17]]
+; CHECK:       [[PRED_SDIV_IF16]]:
+; CHECK-NEXT:    [[TMP19:%.*]] = sdiv i16 [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP19]], i32 3
+; CHECK-NEXT:    br label %[[PRED_SDIV_CONTINUE17]]
+; CHECK:       [[PRED_SDIV_CONTINUE17]]:
+; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x i16> [ [[TMP17]], %[[PRED_SDIV_CONTINUE15]] ], [ [[TMP20]], %[[PRED_SDIV_IF16]] ]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i16> zeroinitializer, <4 x i16> [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT17]], splat (i1 true)
-; CHECK-NEXT:    [[TMP23:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT17]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP24]] = or <4 x i1> [[VEC_PHI]], [[TMP22]]
-; CHECK-NEXT:    [[TMP25]] = or <4 x i1> [[VEC_PHI1]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25]] = or <4 x i1> [[VEC_PHI3]], [[TMP22]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
 ; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
index 8d56c3386a3b..cfae26a3a425 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
@@ -15,19 +15,18 @@ define void @smax_call_uniform(ptr %dst, i64 %x) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE6:.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP2]], label %[[PRED_UREM_IF:.*]], label %[[PRED_UREM_CONTINUE:.*]]
 ; CHECK:       [[PRED_UREM_IF]]:
 ; CHECK-NEXT:    [[REM:%.*]] = urem i64 [[MUL]], [[X]]
 ; CHECK-NEXT:    br label %[[PRED_UREM_CONTINUE]]
 ; CHECK:       [[PRED_UREM_CONTINUE]]:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi i64 [ poison, %[[VECTOR_BODY]] ], [ [[REM]], %[[PRED_UREM_IF]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP5]], label %[[PRED_UREM_IF1:.*]], label %[[PRED_UREM_CONTINUE2:.*]]
 ; CHECK:       [[PRED_UREM_IF1]]:
 ; CHECK-NEXT:    [[TMP6:%.*]] = urem i64 [[MUL]], [[X]]
@@ -48,7 +47,7 @@ define void @smax_call_uniform(ptr %dst, i64 %x) {
 ; CHECK:       [[PRED_UREM_CONTINUE6]]:
 ; CHECK-NEXT:    [[TMP12:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP4]], i64 0)
 ; CHECK-NEXT:    [[TMP13:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP9]], i64 0)
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[P:%.*]] = select i1 [[TMP14]], i64 [[TMP12]], i64 1
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[PREDPHI7:%.*]] = select i1 [[TMP15]], i64 [[TMP13]], i64 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index c14c34cade6b..a0294f7ac799 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -129,6 +129,7 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[IND_END43:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i1> poison, i1 [[TOBOOL6]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT]], <16 x i1> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label [[VECTOR_BODY29:%.*]]
 ; CHECK:       vector.body29:
 ; CHECK-NEXT:    [[INDEX34:%.*]] = phi i64 [ 0, [[VECTOR_PH30]] ], [ [[INDEX_NEXT39:%.*]], [[VECTOR_BODY29]] ]
@@ -138,7 +139,6 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <16 x i64> [[VEC_IND35]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = add nsw <16 x i64> [[TMP30]], [[VEC_IND37]]
 ; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [10 x i32], <16 x ptr> [[TMP31]], <16 x i64> [[TMP32]], i64 0
-; CHECK-NEXT:    [[TMP34:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> splat (i32 8), <16 x ptr> [[TMP33]], i32 16, <16 x i1> [[TMP34]])
 ; CHECK-NEXT:    [[TMP35:%.*]] = or disjoint <16 x i64> [[VEC_IND37]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP36:%.*]] = add nsw <16 x i64> [[TMP30]], [[TMP35]]
@@ -173,16 +173,17 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[N_VEC53]], 2
 ; CHECK-NEXT:    [[IND_END54:%.*]] = add i64 8, [[TMP43]]
 ; CHECK-NEXT:    [[IND_END57:%.*]] = mul i64 [[N_VEC53]], 2
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT50:%.*]] = insertelement <8 x i1> poison, i1 [[TOBOOL6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT73:%.*]] = shufflevector <8 x i1> [[BROADCAST_SPLATINSERT50]], <8 x i1> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP48:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT73]], splat (i1 true)
 ; CHECK-NEXT:    [[DOTSPLATINSERT62:%.*]] = insertelement <8 x i64> poison, i64 [[BC_RESUME_VAL42]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT63:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT62]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION64:%.*]] = add <8 x i64> [[DOTSPLAT63]], <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14>
 ; CHECK-NEXT:    [[DOTSPLATINSERT67:%.*]] = insertelement <8 x i64> poison, i64 [[BC_RESUME_VAL44]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT68:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT67]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION69:%.*]] = add <8 x i64> [[DOTSPLAT68]], <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14>
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT72:%.*]] = insertelement <8 x i1> poison, i1 [[TOBOOL6]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT73:%.*]] = shufflevector <8 x i1> [[BROADCAST_SPLATINSERT72]], <8 x i1> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY50:%.*]]
-; CHECK:       vec.epilog.vector.body50:
+; CHECK:       vec.epilog.vector.body52:
 ; CHECK-NEXT:    [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH42]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ]
 ; CHECK-NEXT:    [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ]
 ; CHECK-NEXT:    [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY50]] ]
@@ -190,7 +191,6 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <8 x i64> [[VEC_IND65]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = add nsw <8 x i64> [[TMP44]], [[VEC_IND70]]
 ; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [10 x i32], <8 x ptr> [[TMP45]], <8 x i64> [[TMP46]], i64 0
-; CHECK-NEXT:    [[TMP48:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT73]], splat (i1 true)
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 8), <8 x ptr> [[TMP47]], i32 16, <8 x i1> [[TMP48]])
 ; CHECK-NEXT:    [[TMP49:%.*]] = or disjoint <8 x i64> [[VEC_IND70]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP50:%.*]] = add nsw <8 x i64> [[TMP44]], [[TMP49]]
diff --git a/llvm/test/Transforms/LoopVectorize/blend-in-header.ll b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll
index 4c95584ff253..2fea016218e6 100644
--- a/llvm/test/Transforms/LoopVectorize/blend-in-header.ll
+++ b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll
@@ -171,11 +171,11 @@ define i64 @invar_cond_incoming_ops_reordered(i1 %c) {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> splat (i64 1), <4 x i64> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> splat (i64 1), <4 x i64> zeroinitializer
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
index c4509e4ad56e..7db53d8ffced 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -172,6 +172,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 2
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
 ; UNROLL-NEXT:    [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]]
+; UNROLL-NEXT:    [[TMP13:%.*]] = xor i1 [[COND_2:%.*]], true
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
@@ -184,7 +185,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP6]]
 ; UNROLL-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
 ; UNROLL-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 4
-; UNROLL-NEXT:    br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE3]]
+; UNROLL-NEXT:    br i1 [[COND_2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE3]]
 ; UNROLL:       pred.store.if:
 ; UNROLL-NEXT:    store i32 [[TMP9]], ptr [[TMP7]], align 4
 ; UNROLL-NEXT:    store i32 [[TMP10]], ptr [[TMP8]], align 4
@@ -192,10 +193,8 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL:       pred.store.continue3:
 ; UNROLL-NEXT:    [[TMP11:%.*]] = add i32 [[VEC_PHI]], 1
 ; UNROLL-NEXT:    [[TMP12:%.*]] = add i32 [[VEC_PHI1]], 1
-; UNROLL-NEXT:    [[TMP13:%.*]] = xor i1 [[COND_2]], true
-; UNROLL-NEXT:    [[TMP14:%.*]] = xor i1 [[COND_2]], true
 ; UNROLL-NEXT:    [[PREDPHI]] = select i1 [[TMP13]], i32 [[VEC_PHI]], i32 [[TMP11]]
-; UNROLL-NEXT:    [[PREDPHI4]] = select i1 [[TMP14]], i32 [[VEC_PHI1]], i32 [[TMP12]]
+; UNROLL-NEXT:    [[PREDPHI4]] = select i1 [[TMP13]], i32 [[VEC_PHI1]], i32 [[TMP12]]
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -244,6 +243,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NOSIMPLIFY-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2
 ; UNROLL-NOSIMPLIFY-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP12:%.*]] = xor i1 [[COND_2:%.*]], true
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NOSIMPLIFY:       vector.body:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
@@ -256,7 +256,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR]], i64 0, i64 [[TMP5]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP6]], align 4
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
-; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[COND_2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; UNROLL-NOSIMPLIFY:       pred.store.if:
 ; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[TMP8]], ptr [[TMP6]], align 4
 ; UNROLL-NOSIMPLIFY-NEXT:    br label [[PRED_STORE_CONTINUE]]
@@ -268,10 +268,8 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; UNROLL-NOSIMPLIFY:       pred.store.continue3:
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP10:%.*]] = add i32 [[VEC_PHI]], 1
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP11:%.*]] = add i32 [[VEC_PHI1]], 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP12:%.*]] = xor i1 [[COND_2]], true
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP13:%.*]] = xor i1 [[COND_2]], true
 ; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI]] = select i1 [[TMP12]], i32 [[VEC_PHI]], i32 [[TMP10]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI4]] = select i1 [[TMP13]], i32 [[VEC_PHI1]], i32 [[TMP11]]
+; UNROLL-NOSIMPLIFY-NEXT:    [[PREDPHI4]] = select i1 [[TMP12]], i32 [[VEC_PHI1]], i32 [[TMP11]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NOSIMPLIFY-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -321,9 +319,10 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; VEC-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 2
 ; VEC-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
 ; VEC-NEXT:    [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]]
-; VEC-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[V_2:%.*]], i32 0
 ; VEC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[COND_2:%.*]], i64 0
 ; VEC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; VEC-NEXT:    [[TMP17:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; VEC-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[V_2:%.*]], i32 0
 ; VEC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VEC:       vector.body:
 ; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
@@ -351,7 +350,6 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
 ; VEC-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; VEC:       pred.store.continue2:
 ; VEC-NEXT:    [[TMP16:%.*]] = add <2 x i32> [[VEC_PHI]], splat (i32 1)
-; VEC-NEXT:    [[TMP17:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; VEC-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP17]], <2 x i32> [[VEC_PHI]], <2 x i32> [[TMP16]]
 ; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VEC-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 2175eab9752c..96311de673d8 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -1962,6 +1962,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[SMAX]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
@@ -1989,7 +1990,6 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE2]]
 ; CHECK:       pred.udiv.continue2:
 ; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x i32> [ [[TMP7]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF1]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP13]]
 ; CHECK-NEXT:    [[TMP15]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
@@ -2030,6 +2030,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; IND:       vector.ph:
 ; IND-NEXT:    [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483646
 ; IND-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
+; IND-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison>
+; IND-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i1> [[TMP11]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; IND-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IND:       vector.body:
 ; IND-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
@@ -2054,8 +2056,6 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; IND-NEXT:    br label [[PRED_UDIV_CONTINUE2]]
 ; IND:       pred.udiv.continue2:
 ; IND-NEXT:    [[TMP10:%.*]] = phi <2 x i32> [ [[TMP5]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP9]], [[PRED_UDIV_IF1]] ]
-; IND-NEXT:    [[TMP11:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison>
-; IND-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i1> [[TMP11]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; IND-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP10]]
 ; IND-NEXT:    [[TMP13]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; IND-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
@@ -2097,7 +2097,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL:       vector.ph:
 ; UNROLL-NEXT:    [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483644
 ; UNROLL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
-; UNROLL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; UNROLL-NEXT:    [[TMP27:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison>
+; UNROLL-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i1> [[TMP27]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; UNROLL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL:       vector.body:
 ; UNROLL-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE8:%.*]] ]
@@ -2143,8 +2144,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
 ; UNROLL:       pred.udiv.continue8:
 ; UNROLL-NEXT:    [[TMP21:%.*]] = phi <2 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP20]], [[PRED_UDIV_IF7]] ]
-; UNROLL-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP11]], <2 x i32> [[WIDE_LOAD]]
-; UNROLL-NEXT:    [[PREDPHI9:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP21]], <2 x i32> [[WIDE_LOAD2]]
+; UNROLL-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP28]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP11]]
+; UNROLL-NEXT:    [[PREDPHI9:%.*]] = select <2 x i1> [[TMP28]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP21]]
 ; UNROLL-NEXT:    [[TMP22]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; UNROLL-NEXT:    [[TMP23]] = add <2 x i32> [[PREDPHI9]], [[VEC_PHI1]]
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -2189,6 +2190,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[SMAX]], [[N_MOD_VF]]
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE8:%.*]] ]
@@ -2239,10 +2241,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
 ; UNROLL-NO-IC:       pred.udiv.continue8:
 ; UNROLL-NO-IC-NEXT:    [[TMP26:%.*]] = phi <2 x i32> [ [[TMP20]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP25]], [[PRED_UDIV_IF7]] ]
-; UNROLL-NO-IC-NEXT:    [[TMP27:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; UNROLL-NO-IC-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP14]]
-; UNROLL-NO-IC-NEXT:    [[PREDPHI9:%.*]] = select <2 x i1> [[TMP28]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP26]]
+; UNROLL-NO-IC-NEXT:    [[PREDPHI9:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP26]]
 ; UNROLL-NO-IC-NEXT:    [[TMP29]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; UNROLL-NO-IC-NEXT:    [[TMP30]] = add <2 x i32> [[PREDPHI9]], [[VEC_PHI1]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -2284,7 +2284,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; INTERLEAVE:       vector.ph:
 ; INTERLEAVE-NEXT:    [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483640
 ; INTERLEAVE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0
-; INTERLEAVE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; INTERLEAVE-NEXT:    [[TMP47:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison, i1 poison, i1 poison>
+; INTERLEAVE-NEXT:    [[TMP48:%.*]] = shufflevector <4 x i1> [[TMP47]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; INTERLEAVE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; INTERLEAVE:       vector.body:
 ; INTERLEAVE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE16:%.*]] ]
@@ -2366,8 +2367,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
 ; INTERLEAVE-NEXT:    br label [[PRED_UDIV_CONTINUE16]]
 ; INTERLEAVE:       pred.udiv.continue16:
 ; INTERLEAVE-NEXT:    [[TMP41:%.*]] = phi <4 x i32> [ [[TMP36]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP40]], [[PRED_UDIV_IF15]] ]
-; INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP21]], <4 x i32> [[WIDE_LOAD]]
-; INTERLEAVE-NEXT:    [[PREDPHI17:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP41]], <4 x i32> [[WIDE_LOAD2]]
+; INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[TMP21]]
+; INTERLEAVE-NEXT:    [[PREDPHI17:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_LOAD2]], <4 x i32> [[TMP41]]
 ; INTERLEAVE-NEXT:    [[TMP42]] = add <4 x i32> [[PREDPHI]], [[VEC_PHI]]
 ; INTERLEAVE-NEXT:    [[TMP43]] = add <4 x i32> [[PREDPHI17]], [[VEC_PHI1]]
 ; INTERLEAVE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
index bc1c1bf04a37..e8ad6a38d742 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
@@ -134,12 +134,12 @@ define void @inv_val_store_to_inv_address_conditional_inv(ptr %a, i64 %n, ptr %b
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i1> poison, i1 [[CMP]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i1> poison, i1 [[CMP]], i64 3
-; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 3
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 0
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT6]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 0
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/pr37248.ll b/llvm/test/Transforms/LoopVectorize/pr37248.ll
index ed7762fbc2ba..fe660a826967 100644
--- a/llvm/test/Transforms/LoopVectorize/pr37248.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr37248.ll
@@ -41,26 +41,26 @@ define void @f1(ptr noalias %b, i1 %c, i32 %start) {
 ; CHECK-NEXT:    [[IND_END:%.*]] = sub i32 [[START]], [[N_VEC]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[START]], [[INDEX]]
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc i32 [[OFFSET_IDX]] to i16
-; CHECK-NEXT:    [[TMP11:%.*]] = add i16 [[TMP10]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc i32 [[OFFSET_IDX]] to i16
+; CHECK-NEXT:    [[TMP12:%.*]] = add i16 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
 ; CHECK-NEXT:    store i32 10, ptr [[B]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
 ; CHECK:       pred.store.if2:
 ; CHECK-NEXT:    store i32 10, ptr [[B]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE3]]
 ; CHECK:       pred.store.continue3:
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x i16], ptr @a, i16 0, i16 [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x i16], ptr @a, i16 0, i16 [[TMP12]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 0
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP16]], i32 -1
 ; CHECK-NEXT:    store <2 x i16> zeroinitializer, ptr [[TMP17]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
index 4f47e66816c9..a129a4b1928c 100644
--- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
@@ -6,25 +6,25 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A:%.*]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i1> poison, i1 [[C_1:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT1]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i1> poison, i1 [[C_2:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT3]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT4]], splat (i1 true)
+; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP6]], <2 x i1> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[A:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i32> [[BROADCAST_SPLAT5]], splat (i32 1)
+; CHECK-NEXT:    [[TMP5:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[BROADCAST_SPLAT4]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ <i32 35902, i32 0>, [[VECTOR_PH]] ], [ [[PREDPHI7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP0:%.*]] = add <2 x i32> [[VEC_PHI]], splat (i32 10)
-; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT4]], splat (i1 true)
-; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP6]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[TMP0]], splat (i32 20)
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[BROADCAST_SPLAT4]], <2 x i1> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> splat (i32 9), <2 x i32> [[VEC_IND]]
 ; CHECK-NEXT:    [[PREDPHI5:%.*]] = select <2 x i1> [[TMP7]], <2 x i32> splat (i32 9), <2 x i32> [[PREDPHI]]
 ; CHECK-NEXT:    [[PREDPHI6:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[TMP0]], <2 x i32> [[VEC_PHI]]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
index 8a8439fca439..ca971f15e487 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
@@ -89,13 +89,13 @@ define i8 @PR34687_no_undef(i1 %c, i32 %x, i32 %n) {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = sdiv <4 x i32> splat (i32 99), [[TMP0]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1)
-; CHECK-NEXT:    [[TMP1:%.*]] = sdiv <4 x i32> splat (i32 99), [[TMP0]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i32> [[VEC_PHI]], splat (i32 255)
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[PREDPHI]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc <4 x i32> [[TMP4]] to <4 x i8>
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
index 301526cf3070..550e52d31823 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -1006,11 +1006,11 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF4IC1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
 ; CHECK-VF4IC1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-VF4IC1-NEXT:    [[TMP0:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 3)
+; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
 ; CHECK-VF4IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4IC1:       [[VECTOR_BODY]]:
 ; CHECK-VF4IC1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4IC1-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
 ; CHECK-VF4IC1-NEXT:    [[TMP2]] = or <4 x i1> [[VEC_PHI]], [[TMP1]]
 ; CHECK-VF4IC1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF4IC1-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1048,6 +1048,7 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF4IC4-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
 ; CHECK-VF4IC4-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-VF4IC4-NEXT:    [[TMP0:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 3)
+; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
 ; CHECK-VF4IC4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4IC4:       [[VECTOR_BODY]]:
 ; CHECK-VF4IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1055,13 +1056,9 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF4IC4-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4IC4-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4IC4-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT:    [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
-; CHECK-VF4IC4-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
-; CHECK-VF4IC4-NEXT:    [[TMP3:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
-; CHECK-VF4IC4-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
-; CHECK-VF4IC4-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP1]]
-; CHECK-VF4IC4-NEXT:    [[TMP6]] = or <4 x i1> [[VEC_PHI1]], [[TMP2]]
-; CHECK-VF4IC4-NEXT:    [[TMP7]] = or <4 x i1> [[VEC_PHI2]], [[TMP3]]
+; CHECK-VF4IC4-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
+; CHECK-VF4IC4-NEXT:    [[TMP6]] = or <4 x i1> [[VEC_PHI1]], [[TMP4]]
+; CHECK-VF4IC4-NEXT:    [[TMP7]] = or <4 x i1> [[VEC_PHI2]], [[TMP4]]
 ; CHECK-VF4IC4-NEXT:    [[TMP8]] = or <4 x i1> [[VEC_PHI3]], [[TMP4]]
 ; CHECK-VF4IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-VF4IC4-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1100,6 +1097,7 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
 ; CHECK-VF1IC4-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-VF1IC4-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[A]], 3
+; CHECK-VF1IC4-NEXT:    [[TMP4:%.*]] = xor i1 [[TMP0]], true
 ; CHECK-VF1IC4-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF1IC4:       [[VECTOR_BODY]]:
 ; CHECK-VF1IC4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1107,13 +1105,9 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF1IC4-NEXT:    [[VEC_PHI1:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF1IC4-NEXT:    [[VEC_PHI2:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF1IC4-NEXT:    [[VEC_PHI3:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF1IC4-NEXT:    [[TMP1:%.*]] = xor i1 [[TMP0]], true
-; CHECK-VF1IC4-NEXT:    [[TMP2:%.*]] = xor i1 [[TMP0]], true
-; CHECK-VF1IC4-NEXT:    [[TMP3:%.*]] = xor i1 [[TMP0]], true
-; CHECK-VF1IC4-NEXT:    [[TMP4:%.*]] = xor i1 [[TMP0]], true
-; CHECK-VF1IC4-NEXT:    [[TMP5]] = or i1 [[VEC_PHI]], [[TMP1]]
-; CHECK-VF1IC4-NEXT:    [[TMP6]] = or i1 [[VEC_PHI1]], [[TMP2]]
-; CHECK-VF1IC4-NEXT:    [[TMP7]] = or i1 [[VEC_PHI2]], [[TMP3]]
+; CHECK-VF1IC4-NEXT:    [[TMP5]] = or i1 [[VEC_PHI]], [[TMP4]]
+; CHECK-VF1IC4-NEXT:    [[TMP6]] = or i1 [[VEC_PHI1]], [[TMP4]]
+; CHECK-VF1IC4-NEXT:    [[TMP7]] = or i1 [[VEC_PHI2]], [[TMP4]]
 ; CHECK-VF1IC4-NEXT:    [[TMP8]] = or i1 [[VEC_PHI3]], [[TMP4]]
 ; CHECK-VF1IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF1IC4-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
index 7590bb9d6868..4ba9cc661313 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
@@ -281,12 +281,12 @@ define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[COND]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP0]])
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP0]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 276
 ; CHECK-NEXT:    [[TMP3:%.*]] = or i1 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_SPLIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll
index 31732f027f6d..892ddccbc93b 100644
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll
@@ -10,12 +10,12 @@ define void @tail_fold_switch(ptr %dst, i32 %0) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IND]], splat (i64 4)
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/uncountable-single-exit-loops.ll b/llvm/test/Transforms/LoopVectorize/uncountable-single-exit-loops.ll
new file mode 100644
index 000000000000..252061335e73
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/uncountable-single-exit-loops.ll
@@ -0,0 +1,52 @@
+; REQUIRES: asserts
+; RUN: opt -p loop-vectorize -debug %s 2>&1 | FileCheck %s
+
+
+; CHECK-LABEL: LV: Checking a loop in 'latch_exit_cannot_compute_btc_due_to_step'
+; CHECK: 	   LV: Did not find one integer induction var.
+; CHECK-NEXT:  LV: Not vectorizing: Early exit is not the latch predecessor.
+; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
+; CHECK-NEXT:  LV: Not vectorizing: Cannot prove legality.
+
+; CHECK-LABEL: LV: Checking a loop in 'header_exit_cannot_compute_btc_due_to_step'
+; CHECK:       LV: Found an induction variable.
+; CHECK-NEXT:  LV: Did not find one integer induction var.
+; CHECK-NEXT:  LV: Not vectorizing: Cannot determine exact exit count for latch block.
+; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
+; CHECK-NEXT:  LV: Not vectorizing: Cannot prove legality.
+
+; CHECK-NOT: vector.body
+define void @latch_exit_cannot_compute_btc_due_to_step(ptr %dst, i64 %step) {
+entry:
+  br label %loop
+
+loop:                                   ; preds = %loop, %for.cond.us
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.next = add i64 %iv, %step
+  %gep = getelementptr i8, ptr %dst, i64 %iv
+  store i8 0, ptr %gep, align 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @header_exit_cannot_compute_btc_due_to_step(ptr %dst, i64 %step) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %iv.next = add i64 %iv, %step
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %loop.latch, label %exit
+
+loop.latch:
+  %gep = getelementptr i8, ptr %dst, i64 %iv
+  store i8 0, ptr %gep, align 1
+  br label %loop.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index 72011ca7f484..7f5e0f3a77ef 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -133,11 +133,11 @@ define void @blend_chain_iv(i1 %c) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[VEC_IND]], <4 x i64> undef
 ; CHECK-NEXT:    [[PREDPHI1:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI]], <4 x i64> undef
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
index fe5811e7e115..85b44a7076d1 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
@@ -707,6 +707,542 @@ outer.latch:
 exit:
   ret void
 }
+
+declare void @llvm.assume(i1)
+
+; Test case for https://github.com/llvm/llvm-project/issues/121897.
+define void @scev_expand_step(i64 %x, ptr %dst) {
+; VF8UF1-LABEL: define void @scev_expand_step(
+; VF8UF1-SAME: i64 [[X:%.*]], ptr [[DST:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    [[C:%.*]] = icmp eq i64 [[X]], 65536
+; VF8UF1-NEXT:    call void @llvm.assume(i1 [[C]])
+; VF8UF1-NEXT:    [[FR:%.*]] = freeze i64 [[X]]
+; VF8UF1-NEXT:    [[STEP:%.*]] = add i64 [[FR]], -65534
+; VF8UF1-NEXT:    [[TMP0:%.*]] = udiv i64 15, [[STEP]]
+; VF8UF1-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; VF8UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], 7
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1
+; VF8UF1-NEXT:    [[TMP2:%.*]] = mul i64 [[N_VEC]], [[STEP]]
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF1-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT]]
+; VF8UF1-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; VF8UF1-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF8UF1:       [[PRED_STORE_IF]]:
+; VF8UF1-NEXT:    [[TMP5:%.*]] = mul i64 0, [[STEP]]
+; VF8UF1-NEXT:    [[TMP6:%.*]] = add i64 0, [[TMP5]]
+; VF8UF1-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP8]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF8UF1:       [[PRED_STORE_CONTINUE]]:
+; VF8UF1-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1
+; VF8UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF8UF1:       [[PRED_STORE_IF1]]:
+; VF8UF1-NEXT:    [[TMP10:%.*]] = mul i64 1, [[STEP]]
+; VF8UF1-NEXT:    [[TMP11:%.*]] = add i64 0, [[TMP10]]
+; VF8UF1-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP13]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF8UF1:       [[PRED_STORE_CONTINUE2]]:
+; VF8UF1-NEXT:    [[TMP14:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2
+; VF8UF1-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF1:       [[PRED_STORE_IF3]]:
+; VF8UF1-NEXT:    [[TMP15:%.*]] = mul i64 2, [[STEP]]
+; VF8UF1-NEXT:    [[TMP16:%.*]] = add i64 0, [[TMP15]]
+; VF8UF1-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP18]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF8UF1:       [[PRED_STORE_CONTINUE4]]:
+; VF8UF1-NEXT:    [[TMP19:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3
+; VF8UF1-NEXT:    br i1 [[TMP19]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF1:       [[PRED_STORE_IF5]]:
+; VF8UF1-NEXT:    [[TMP20:%.*]] = mul i64 3, [[STEP]]
+; VF8UF1-NEXT:    [[TMP21:%.*]] = add i64 0, [[TMP20]]
+; VF8UF1-NEXT:    [[TMP22:%.*]] = add i64 [[TMP21]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP22]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP23]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF8UF1:       [[PRED_STORE_CONTINUE6]]:
+; VF8UF1-NEXT:    [[TMP24:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4
+; VF8UF1-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF1:       [[PRED_STORE_IF7]]:
+; VF8UF1-NEXT:    [[TMP25:%.*]] = mul i64 4, [[STEP]]
+; VF8UF1-NEXT:    [[TMP26:%.*]] = add i64 0, [[TMP25]]
+; VF8UF1-NEXT:    [[TMP27:%.*]] = add i64 [[TMP26]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP27]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP28]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF8UF1:       [[PRED_STORE_CONTINUE8]]:
+; VF8UF1-NEXT:    [[TMP29:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5
+; VF8UF1-NEXT:    br i1 [[TMP29]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF1:       [[PRED_STORE_IF9]]:
+; VF8UF1-NEXT:    [[TMP30:%.*]] = mul i64 5, [[STEP]]
+; VF8UF1-NEXT:    [[TMP31:%.*]] = add i64 0, [[TMP30]]
+; VF8UF1-NEXT:    [[TMP32:%.*]] = add i64 [[TMP31]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP32]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP33]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF8UF1:       [[PRED_STORE_CONTINUE10]]:
+; VF8UF1-NEXT:    [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6
+; VF8UF1-NEXT:    br i1 [[TMP34]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF8UF1:       [[PRED_STORE_IF11]]:
+; VF8UF1-NEXT:    [[TMP35:%.*]] = mul i64 6, [[STEP]]
+; VF8UF1-NEXT:    [[TMP36:%.*]] = add i64 0, [[TMP35]]
+; VF8UF1-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP38:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP37]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP38]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF8UF1:       [[PRED_STORE_CONTINUE12]]:
+; VF8UF1-NEXT:    [[TMP39:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7
+; VF8UF1-NEXT:    br i1 [[TMP39]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF8UF1:       [[PRED_STORE_IF13]]:
+; VF8UF1-NEXT:    [[TMP40:%.*]] = mul i64 7, [[STEP]]
+; VF8UF1-NEXT:    [[TMP41:%.*]] = add i64 0, [[TMP40]]
+; VF8UF1-NEXT:    [[TMP42:%.*]] = add i64 [[TMP41]], [[STEP]]
+; VF8UF1-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP42]]
+; VF8UF1-NEXT:    store i8 0, ptr [[TMP43]], align 1
+; VF8UF1-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF8UF1:       [[PRED_STORE_CONTINUE14]]:
+; VF8UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF1-NEXT:    br label %[[LOOP:.*]]
+; VF8UF1:       [[LOOP]]:
+; VF8UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], [[STEP]]
+; VF8UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]]
+; VF8UF1-NEXT:    store i8 0, ptr [[GEP_DST]], align 1
+; VF8UF1-NEXT:    [[EC:%.*]] = icmp slt i64 [[IV_NEXT]], 16
+; VF8UF1-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
+;
+; VF8UF2-LABEL: define void @scev_expand_step(
+; VF8UF2-SAME: i64 [[X:%.*]], ptr [[DST:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    [[C:%.*]] = icmp eq i64 [[X]], 65536
+; VF8UF2-NEXT:    call void @llvm.assume(i1 [[C]])
+; VF8UF2-NEXT:    [[FR:%.*]] = freeze i64 [[X]]
+; VF8UF2-NEXT:    [[STEP:%.*]] = add i64 [[FR]], -65534
+; VF8UF2-NEXT:    [[TMP0:%.*]] = udiv i64 15, [[STEP]]
+; VF8UF2-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; VF8UF2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], 15
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1
+; VF8UF2-NEXT:    [[TMP2:%.*]] = mul i64 [[N_VEC]], [[STEP]]
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF2-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[BROADCAST_SPLAT]]
+; VF8UF2-NEXT:    [[TMP4:%.*]] = icmp ule <8 x i64> <i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT]]
+; VF8UF2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; VF8UF2-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF8UF2:       [[PRED_STORE_IF]]:
+; VF8UF2-NEXT:    [[TMP6:%.*]] = mul i64 0, [[STEP]]
+; VF8UF2-NEXT:    [[TMP7:%.*]] = add i64 0, [[TMP6]]
+; VF8UF2-NEXT:    [[TMP8:%.*]] = add i64 [[TMP7]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP9]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF8UF2:       [[PRED_STORE_CONTINUE]]:
+; VF8UF2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1
+; VF8UF2-NEXT:    br i1 [[TMP10]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF8UF2:       [[PRED_STORE_IF1]]:
+; VF8UF2-NEXT:    [[TMP11:%.*]] = mul i64 1, [[STEP]]
+; VF8UF2-NEXT:    [[TMP12:%.*]] = add i64 0, [[TMP11]]
+; VF8UF2-NEXT:    [[TMP13:%.*]] = add i64 [[TMP12]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP14]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF8UF2:       [[PRED_STORE_CONTINUE2]]:
+; VF8UF2-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2
+; VF8UF2-NEXT:    br i1 [[TMP15]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF2:       [[PRED_STORE_IF3]]:
+; VF8UF2-NEXT:    [[TMP16:%.*]] = mul i64 2, [[STEP]]
+; VF8UF2-NEXT:    [[TMP17:%.*]] = add i64 0, [[TMP16]]
+; VF8UF2-NEXT:    [[TMP18:%.*]] = add i64 [[TMP17]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP18]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP19]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF8UF2:       [[PRED_STORE_CONTINUE4]]:
+; VF8UF2-NEXT:    [[TMP20:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3
+; VF8UF2-NEXT:    br i1 [[TMP20]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF2:       [[PRED_STORE_IF5]]:
+; VF8UF2-NEXT:    [[TMP21:%.*]] = mul i64 3, [[STEP]]
+; VF8UF2-NEXT:    [[TMP22:%.*]] = add i64 0, [[TMP21]]
+; VF8UF2-NEXT:    [[TMP23:%.*]] = add i64 [[TMP22]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP24]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF8UF2:       [[PRED_STORE_CONTINUE6]]:
+; VF8UF2-NEXT:    [[TMP25:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4
+; VF8UF2-NEXT:    br i1 [[TMP25]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF2:       [[PRED_STORE_IF7]]:
+; VF8UF2-NEXT:    [[TMP26:%.*]] = mul i64 4, [[STEP]]
+; VF8UF2-NEXT:    [[TMP27:%.*]] = add i64 0, [[TMP26]]
+; VF8UF2-NEXT:    [[TMP28:%.*]] = add i64 [[TMP27]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP28]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP29]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF8UF2:       [[PRED_STORE_CONTINUE8]]:
+; VF8UF2-NEXT:    [[TMP30:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5
+; VF8UF2-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF2:       [[PRED_STORE_IF9]]:
+; VF8UF2-NEXT:    [[TMP31:%.*]] = mul i64 5, [[STEP]]
+; VF8UF2-NEXT:    [[TMP32:%.*]] = add i64 0, [[TMP31]]
+; VF8UF2-NEXT:    [[TMP33:%.*]] = add i64 [[TMP32]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP33]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP34]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF8UF2:       [[PRED_STORE_CONTINUE10]]:
+; VF8UF2-NEXT:    [[TMP35:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6
+; VF8UF2-NEXT:    br i1 [[TMP35]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF8UF2:       [[PRED_STORE_IF11]]:
+; VF8UF2-NEXT:    [[TMP36:%.*]] = mul i64 6, [[STEP]]
+; VF8UF2-NEXT:    [[TMP37:%.*]] = add i64 0, [[TMP36]]
+; VF8UF2-NEXT:    [[TMP38:%.*]] = add i64 [[TMP37]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP39:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP38]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP39]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF8UF2:       [[PRED_STORE_CONTINUE12]]:
+; VF8UF2-NEXT:    [[TMP40:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7
+; VF8UF2-NEXT:    br i1 [[TMP40]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF8UF2:       [[PRED_STORE_IF13]]:
+; VF8UF2-NEXT:    [[TMP41:%.*]] = mul i64 7, [[STEP]]
+; VF8UF2-NEXT:    [[TMP42:%.*]] = add i64 0, [[TMP41]]
+; VF8UF2-NEXT:    [[TMP43:%.*]] = add i64 [[TMP42]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP43]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP44]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF8UF2:       [[PRED_STORE_CONTINUE14]]:
+; VF8UF2-NEXT:    [[TMP45:%.*]] = extractelement <8 x i1> [[TMP4]], i32 0
+; VF8UF2-NEXT:    br i1 [[TMP45]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF8UF2:       [[PRED_STORE_IF15]]:
+; VF8UF2-NEXT:    [[TMP46:%.*]] = mul i64 8, [[STEP]]
+; VF8UF2-NEXT:    [[TMP47:%.*]] = add i64 0, [[TMP46]]
+; VF8UF2-NEXT:    [[TMP48:%.*]] = add i64 [[TMP47]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP49:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP48]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP49]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
+; VF8UF2:       [[PRED_STORE_CONTINUE16]]:
+; VF8UF2-NEXT:    [[TMP50:%.*]] = extractelement <8 x i1> [[TMP4]], i32 1
+; VF8UF2-NEXT:    br i1 [[TMP50]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF8UF2:       [[PRED_STORE_IF17]]:
+; VF8UF2-NEXT:    [[TMP51:%.*]] = mul i64 9, [[STEP]]
+; VF8UF2-NEXT:    [[TMP52:%.*]] = add i64 0, [[TMP51]]
+; VF8UF2-NEXT:    [[TMP53:%.*]] = add i64 [[TMP52]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP53]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP54]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
+; VF8UF2:       [[PRED_STORE_CONTINUE18]]:
+; VF8UF2-NEXT:    [[TMP55:%.*]] = extractelement <8 x i1> [[TMP4]], i32 2
+; VF8UF2-NEXT:    br i1 [[TMP55]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF8UF2:       [[PRED_STORE_IF19]]:
+; VF8UF2-NEXT:    [[TMP56:%.*]] = mul i64 10, [[STEP]]
+; VF8UF2-NEXT:    [[TMP57:%.*]] = add i64 0, [[TMP56]]
+; VF8UF2-NEXT:    [[TMP58:%.*]] = add i64 [[TMP57]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP59:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP58]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP59]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
+; VF8UF2:       [[PRED_STORE_CONTINUE20]]:
+; VF8UF2-NEXT:    [[TMP60:%.*]] = extractelement <8 x i1> [[TMP4]], i32 3
+; VF8UF2-NEXT:    br i1 [[TMP60]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF8UF2:       [[PRED_STORE_IF21]]:
+; VF8UF2-NEXT:    [[TMP61:%.*]] = mul i64 11, [[STEP]]
+; VF8UF2-NEXT:    [[TMP62:%.*]] = add i64 0, [[TMP61]]
+; VF8UF2-NEXT:    [[TMP63:%.*]] = add i64 [[TMP62]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP64:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP63]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP64]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
+; VF8UF2:       [[PRED_STORE_CONTINUE22]]:
+; VF8UF2-NEXT:    [[TMP65:%.*]] = extractelement <8 x i1> [[TMP4]], i32 4
+; VF8UF2-NEXT:    br i1 [[TMP65]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF8UF2:       [[PRED_STORE_IF23]]:
+; VF8UF2-NEXT:    [[TMP66:%.*]] = mul i64 12, [[STEP]]
+; VF8UF2-NEXT:    [[TMP67:%.*]] = add i64 0, [[TMP66]]
+; VF8UF2-NEXT:    [[TMP68:%.*]] = add i64 [[TMP67]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP69:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP68]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP69]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
+; VF8UF2:       [[PRED_STORE_CONTINUE24]]:
+; VF8UF2-NEXT:    [[TMP70:%.*]] = extractelement <8 x i1> [[TMP4]], i32 5
+; VF8UF2-NEXT:    br i1 [[TMP70]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF8UF2:       [[PRED_STORE_IF25]]:
+; VF8UF2-NEXT:    [[TMP71:%.*]] = mul i64 13, [[STEP]]
+; VF8UF2-NEXT:    [[TMP72:%.*]] = add i64 0, [[TMP71]]
+; VF8UF2-NEXT:    [[TMP73:%.*]] = add i64 [[TMP72]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP74:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP73]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP74]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
+; VF8UF2:       [[PRED_STORE_CONTINUE26]]:
+; VF8UF2-NEXT:    [[TMP75:%.*]] = extractelement <8 x i1> [[TMP4]], i32 6
+; VF8UF2-NEXT:    br i1 [[TMP75]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF8UF2:       [[PRED_STORE_IF27]]:
+; VF8UF2-NEXT:    [[TMP76:%.*]] = mul i64 14, [[STEP]]
+; VF8UF2-NEXT:    [[TMP77:%.*]] = add i64 0, [[TMP76]]
+; VF8UF2-NEXT:    [[TMP78:%.*]] = add i64 [[TMP77]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP79:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP78]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP79]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
+; VF8UF2:       [[PRED_STORE_CONTINUE28]]:
+; VF8UF2-NEXT:    [[TMP80:%.*]] = extractelement <8 x i1> [[TMP4]], i32 7
+; VF8UF2-NEXT:    br i1 [[TMP80]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF8UF2:       [[PRED_STORE_IF29]]:
+; VF8UF2-NEXT:    [[TMP81:%.*]] = mul i64 15, [[STEP]]
+; VF8UF2-NEXT:    [[TMP82:%.*]] = add i64 0, [[TMP81]]
+; VF8UF2-NEXT:    [[TMP83:%.*]] = add i64 [[TMP82]], [[STEP]]
+; VF8UF2-NEXT:    [[TMP84:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP83]]
+; VF8UF2-NEXT:    store i8 0, ptr [[TMP84]], align 1
+; VF8UF2-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
+; VF8UF2:       [[PRED_STORE_CONTINUE30]]:
+; VF8UF2-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF2-NEXT:    br label %[[LOOP:.*]]
+; VF8UF2:       [[LOOP]]:
+; VF8UF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[IV_NEXT]] = add i64 [[IV]], [[STEP]]
+; VF8UF2-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]]
+; VF8UF2-NEXT:    store i8 0, ptr [[GEP_DST]], align 1
+; VF8UF2-NEXT:    [[EC:%.*]] = icmp slt i64 [[IV_NEXT]], 16
+; VF8UF2-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
+;
+; VF16UF1-LABEL: define void @scev_expand_step(
+; VF16UF1-SAME: i64 [[X:%.*]], ptr [[DST:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    [[C:%.*]] = icmp eq i64 [[X]], 65536
+; VF16UF1-NEXT:    call void @llvm.assume(i1 [[C]])
+; VF16UF1-NEXT:    [[FR:%.*]] = freeze i64 [[X]]
+; VF16UF1-NEXT:    [[STEP:%.*]] = add i64 [[FR]], -65534
+; VF16UF1-NEXT:    [[TMP0:%.*]] = udiv i64 15, [[STEP]]
+; VF16UF1-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; VF16UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], 15
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1
+; VF16UF1-NEXT:    [[TMP2:%.*]] = mul i64 [[N_VEC]], [[STEP]]
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF16UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; VF16UF1-NEXT:    [[TMP3:%.*]] = icmp ule <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT]]
+; VF16UF1-NEXT:    [[TMP4:%.*]] = extractelement <16 x i1> [[TMP3]], i32 0
+; VF16UF1-NEXT:    br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF16UF1:       [[PRED_STORE_IF]]:
+; VF16UF1-NEXT:    [[TMP5:%.*]] = mul i64 0, [[STEP]]
+; VF16UF1-NEXT:    [[TMP6:%.*]] = add i64 0, [[TMP5]]
+; VF16UF1-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP8]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VF16UF1:       [[PRED_STORE_CONTINUE]]:
+; VF16UF1-NEXT:    [[TMP9:%.*]] = extractelement <16 x i1> [[TMP3]], i32 1
+; VF16UF1-NEXT:    br i1 [[TMP9]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; VF16UF1:       [[PRED_STORE_IF1]]:
+; VF16UF1-NEXT:    [[TMP10:%.*]] = mul i64 1, [[STEP]]
+; VF16UF1-NEXT:    [[TMP11:%.*]] = add i64 0, [[TMP10]]
+; VF16UF1-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP13]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
+; VF16UF1:       [[PRED_STORE_CONTINUE2]]:
+; VF16UF1-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP3]], i32 2
+; VF16UF1-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF16UF1:       [[PRED_STORE_IF3]]:
+; VF16UF1-NEXT:    [[TMP15:%.*]] = mul i64 2, [[STEP]]
+; VF16UF1-NEXT:    [[TMP16:%.*]] = add i64 0, [[TMP15]]
+; VF16UF1-NEXT:    [[TMP17:%.*]] = add i64 [[TMP16]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP18]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
+; VF16UF1:       [[PRED_STORE_CONTINUE4]]:
+; VF16UF1-NEXT:    [[TMP19:%.*]] = extractelement <16 x i1> [[TMP3]], i32 3
+; VF16UF1-NEXT:    br i1 [[TMP19]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF16UF1:       [[PRED_STORE_IF5]]:
+; VF16UF1-NEXT:    [[TMP20:%.*]] = mul i64 3, [[STEP]]
+; VF16UF1-NEXT:    [[TMP21:%.*]] = add i64 0, [[TMP20]]
+; VF16UF1-NEXT:    [[TMP22:%.*]] = add i64 [[TMP21]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP22]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP23]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
+; VF16UF1:       [[PRED_STORE_CONTINUE6]]:
+; VF16UF1-NEXT:    [[TMP24:%.*]] = extractelement <16 x i1> [[TMP3]], i32 4
+; VF16UF1-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF16UF1:       [[PRED_STORE_IF7]]:
+; VF16UF1-NEXT:    [[TMP25:%.*]] = mul i64 4, [[STEP]]
+; VF16UF1-NEXT:    [[TMP26:%.*]] = add i64 0, [[TMP25]]
+; VF16UF1-NEXT:    [[TMP27:%.*]] = add i64 [[TMP26]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP27]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP28]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
+; VF16UF1:       [[PRED_STORE_CONTINUE8]]:
+; VF16UF1-NEXT:    [[TMP29:%.*]] = extractelement <16 x i1> [[TMP3]], i32 5
+; VF16UF1-NEXT:    br i1 [[TMP29]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF16UF1:       [[PRED_STORE_IF9]]:
+; VF16UF1-NEXT:    [[TMP30:%.*]] = mul i64 5, [[STEP]]
+; VF16UF1-NEXT:    [[TMP31:%.*]] = add i64 0, [[TMP30]]
+; VF16UF1-NEXT:    [[TMP32:%.*]] = add i64 [[TMP31]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP33:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP32]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP33]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
+; VF16UF1:       [[PRED_STORE_CONTINUE10]]:
+; VF16UF1-NEXT:    [[TMP34:%.*]] = extractelement <16 x i1> [[TMP3]], i32 6
+; VF16UF1-NEXT:    br i1 [[TMP34]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF16UF1:       [[PRED_STORE_IF11]]:
+; VF16UF1-NEXT:    [[TMP35:%.*]] = mul i64 6, [[STEP]]
+; VF16UF1-NEXT:    [[TMP36:%.*]] = add i64 0, [[TMP35]]
+; VF16UF1-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP38:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP37]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP38]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
+; VF16UF1:       [[PRED_STORE_CONTINUE12]]:
+; VF16UF1-NEXT:    [[TMP39:%.*]] = extractelement <16 x i1> [[TMP3]], i32 7
+; VF16UF1-NEXT:    br i1 [[TMP39]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF16UF1:       [[PRED_STORE_IF13]]:
+; VF16UF1-NEXT:    [[TMP40:%.*]] = mul i64 7, [[STEP]]
+; VF16UF1-NEXT:    [[TMP41:%.*]] = add i64 0, [[TMP40]]
+; VF16UF1-NEXT:    [[TMP42:%.*]] = add i64 [[TMP41]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP42]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP43]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
+; VF16UF1:       [[PRED_STORE_CONTINUE14]]:
+; VF16UF1-NEXT:    [[TMP44:%.*]] = extractelement <16 x i1> [[TMP3]], i32 8
+; VF16UF1-NEXT:    br i1 [[TMP44]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF16UF1:       [[PRED_STORE_IF15]]:
+; VF16UF1-NEXT:    [[TMP45:%.*]] = mul i64 8, [[STEP]]
+; VF16UF1-NEXT:    [[TMP46:%.*]] = add i64 0, [[TMP45]]
+; VF16UF1-NEXT:    [[TMP47:%.*]] = add i64 [[TMP46]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP47]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP48]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
+; VF16UF1:       [[PRED_STORE_CONTINUE16]]:
+; VF16UF1-NEXT:    [[TMP49:%.*]] = extractelement <16 x i1> [[TMP3]], i32 9
+; VF16UF1-NEXT:    br i1 [[TMP49]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF16UF1:       [[PRED_STORE_IF17]]:
+; VF16UF1-NEXT:    [[TMP50:%.*]] = mul i64 9, [[STEP]]
+; VF16UF1-NEXT:    [[TMP51:%.*]] = add i64 0, [[TMP50]]
+; VF16UF1-NEXT:    [[TMP52:%.*]] = add i64 [[TMP51]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP53:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP52]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP53]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
+; VF16UF1:       [[PRED_STORE_CONTINUE18]]:
+; VF16UF1-NEXT:    [[TMP54:%.*]] = extractelement <16 x i1> [[TMP3]], i32 10
+; VF16UF1-NEXT:    br i1 [[TMP54]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF16UF1:       [[PRED_STORE_IF19]]:
+; VF16UF1-NEXT:    [[TMP55:%.*]] = mul i64 10, [[STEP]]
+; VF16UF1-NEXT:    [[TMP56:%.*]] = add i64 0, [[TMP55]]
+; VF16UF1-NEXT:    [[TMP57:%.*]] = add i64 [[TMP56]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP58:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP57]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP58]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
+; VF16UF1:       [[PRED_STORE_CONTINUE20]]:
+; VF16UF1-NEXT:    [[TMP59:%.*]] = extractelement <16 x i1> [[TMP3]], i32 11
+; VF16UF1-NEXT:    br i1 [[TMP59]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF16UF1:       [[PRED_STORE_IF21]]:
+; VF16UF1-NEXT:    [[TMP60:%.*]] = mul i64 11, [[STEP]]
+; VF16UF1-NEXT:    [[TMP61:%.*]] = add i64 0, [[TMP60]]
+; VF16UF1-NEXT:    [[TMP62:%.*]] = add i64 [[TMP61]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP63:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP62]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP63]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
+; VF16UF1:       [[PRED_STORE_CONTINUE22]]:
+; VF16UF1-NEXT:    [[TMP64:%.*]] = extractelement <16 x i1> [[TMP3]], i32 12
+; VF16UF1-NEXT:    br i1 [[TMP64]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF16UF1:       [[PRED_STORE_IF23]]:
+; VF16UF1-NEXT:    [[TMP65:%.*]] = mul i64 12, [[STEP]]
+; VF16UF1-NEXT:    [[TMP66:%.*]] = add i64 0, [[TMP65]]
+; VF16UF1-NEXT:    [[TMP67:%.*]] = add i64 [[TMP66]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP68:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP67]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP68]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
+; VF16UF1:       [[PRED_STORE_CONTINUE24]]:
+; VF16UF1-NEXT:    [[TMP69:%.*]] = extractelement <16 x i1> [[TMP3]], i32 13
+; VF16UF1-NEXT:    br i1 [[TMP69]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF16UF1:       [[PRED_STORE_IF25]]:
+; VF16UF1-NEXT:    [[TMP70:%.*]] = mul i64 13, [[STEP]]
+; VF16UF1-NEXT:    [[TMP71:%.*]] = add i64 0, [[TMP70]]
+; VF16UF1-NEXT:    [[TMP72:%.*]] = add i64 [[TMP71]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP73:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP72]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP73]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE26]]
+; VF16UF1:       [[PRED_STORE_CONTINUE26]]:
+; VF16UF1-NEXT:    [[TMP74:%.*]] = extractelement <16 x i1> [[TMP3]], i32 14
+; VF16UF1-NEXT:    br i1 [[TMP74]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF16UF1:       [[PRED_STORE_IF27]]:
+; VF16UF1-NEXT:    [[TMP75:%.*]] = mul i64 14, [[STEP]]
+; VF16UF1-NEXT:    [[TMP76:%.*]] = add i64 0, [[TMP75]]
+; VF16UF1-NEXT:    [[TMP77:%.*]] = add i64 [[TMP76]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP78:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP77]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP78]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE28]]
+; VF16UF1:       [[PRED_STORE_CONTINUE28]]:
+; VF16UF1-NEXT:    [[TMP79:%.*]] = extractelement <16 x i1> [[TMP3]], i32 15
+; VF16UF1-NEXT:    br i1 [[TMP79]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF16UF1:       [[PRED_STORE_IF29]]:
+; VF16UF1-NEXT:    [[TMP80:%.*]] = mul i64 15, [[STEP]]
+; VF16UF1-NEXT:    [[TMP81:%.*]] = add i64 0, [[TMP80]]
+; VF16UF1-NEXT:    [[TMP82:%.*]] = add i64 [[TMP81]], [[STEP]]
+; VF16UF1-NEXT:    [[TMP83:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP82]]
+; VF16UF1-NEXT:    store i8 0, ptr [[TMP83]], align 1
+; VF16UF1-NEXT:    br label %[[PRED_STORE_CONTINUE30]]
+; VF16UF1:       [[PRED_STORE_CONTINUE30]]:
+; VF16UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF16UF1-NEXT:    br label %[[LOOP:.*]]
+; VF16UF1:       [[LOOP]]:
+; VF16UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[IV_NEXT]] = add i64 [[IV]], [[STEP]]
+; VF16UF1-NEXT:    [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]]
+; VF16UF1-NEXT:    store i8 0, ptr [[GEP_DST]], align 1
+; VF16UF1-NEXT:    [[EC:%.*]] = icmp slt i64 [[IV_NEXT]], 16
+; VF16UF1-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
+;
+entry:
+  %c = icmp eq i64 %x, 65536
+  call void @llvm.assume(i1 %c)
+  %fr = freeze i64 %x
+  %step = add i64 %fr, -65534
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.next = add i64 %iv, %step
+  %gep.dst = getelementptr i8, ptr %dst, i64 %iv.next
+  store i8 0, ptr %gep.dst, align 1
+  %ec = icmp slt i64 %iv.next, 16
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  ret void
+}
+
 ;.
 ; VF8UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; VF8UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -715,16 +1251,19 @@ exit:
 ; VF8UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
 ; VF8UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
 ; VF8UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
 ;.
 ; VF8UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; VF8UF2: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; VF8UF2: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
 ; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
 ; VF8UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF8UF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
 ;.
 ; VF16UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; VF16UF1: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; VF16UF1: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
 ; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
 ; VF16UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF16UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
 ;.
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll b/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll
new file mode 100644
index 000000000000..759d5115896c
--- /dev/null
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll
@@ -0,0 +1,159 @@
+;; Test recursion handling during cloning.
+;;
+;; Original code looks like:
+;;
+;; #include <stdlib.h>
+;; #include <string.h>
+;; #include <unistd.h>
+;; __attribute((noinline)) char *D() {
+;;   return new char[10];
+;; }
+;; __attribute((noinline)) char *B(int n);
+;; __attribute((noinline)) char *C(int n) {
+;;   if (!n) {
+;;     return D();
+;;   }
+;;   return B(n-1);
+;; }
+;; __attribute((noinline)) char *B(int n) {
+;;   return C(n);
+;; }
+;; int main(int argc, char **argv) {
+;;   char *x = B(1);
+;;   char *y = B(1);
+;;   char *z = B(0);
+;;   memset(x, 0, 10);
+;;   memset(y, 0, 10);
+;;   memset(z, 0, 10);
+;;   free(x);
+;;   sleep(200);
+;;   free(y);
+;;   free(z);
+;;   return 0;
+;; }
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+;; By default we should enable cloning of contexts involved with recursive
+;; cycles, but not through the cycle itself. I.e. until full support for
+;; recursion is added, the cloned recursive call from C back to B (line 12) will
+;; not be updated to call a clone.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:  %s -S 2>&1 | FileCheck %s \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS
+
+;; Skipping recursive callsites should result in no cloning.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:	-memprof-allow-recursive-callsites=false \
+; RUN:  %s -S 2>&1 | FileCheck %s \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --implicit-check-not="created clone" \
+; RUN:	--implicit-check-not="marked with memprof allocation attribute cold" \
+; RUN:  --check-prefix=ALL
+
+;; Skipping recursive contexts should prevent spurious call to cloned version of
+;; B from the context starting at memprof_recursive.cc:19:13, which is actually
+;; recursive (until that support is added).
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=memprof-context-disambiguation \
+; RUN:	-memprof-allow-recursive-contexts=false \
+; RUN:  %s -S 2>&1 | FileCheck %s \
+; RUN:  --implicit-check-not "memprof_recursive3.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \
+; RUN:  --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=SKIP-RECUR-CONTEXTS
+
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:4:0: created clone _Z1Dv.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:8:0: created clone _Z1Ci.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:14:0: created clone _Z1Bi.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:20:13: call in clone main assigned to call function clone _Z1Bi.memprof.1
+;; We should only call the cold clone for the recursive context if we enabled
+;; recursive contexts via -memprof-allow-recursive-contexts=true (default).
+; ALLOW-RECUR-CONTEXTS: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:15:10: call in clone _Z1Bi.memprof.1 assigned to call function clone _Z1Ci.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:10:12: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Dv.memprof.1
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:5:10: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold
+;; We should call the original B for the recursive context if we have
+;; disabled recursive contexts via -memprof-allow-recursive-contexts=false.
+; SKIP-RECUR-CONTEXTS: memprof_recursive.cc:19:13: call in clone main assigned to call function clone _Z1Bi
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:12:10: call in clone _Z1Ci assigned to call function clone _Z1Bi
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:18:13: call in clone main assigned to call function clone _Z1Bi
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:15:10: call in clone _Z1Bi assigned to call function clone _Z1Ci
+; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:10:12: call in clone _Z1Ci assigned to call function clone _Z1Dv
+; ALL: memprof_recursive.cc:5:10: call in clone _Z1Dv marked with memprof allocation attribute notcold
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define ptr @_Z1Dv() !dbg !3 {
+entry:
+  %call = tail call ptr @_Znam(i64 10), !dbg !6, !memprof !7, !callsite !14
+  ret ptr null
+}
+
+define ptr @_Z1Ci(i32 %n) !dbg !15 {
+entry:
+  %call = tail call ptr @_Z1Dv(), !dbg !16, !callsite !17
+  br label %return
+
+if.end:                                           ; No predecessors!
+  %call1 = tail call ptr @_Z1Bi(i32 0), !dbg !18, !callsite !19
+  br label %return
+
+return:                                           ; preds = %if.end, %entry
+  ret ptr null
+}
+
+define ptr @_Z1Bi(i32 %n) !dbg !20 {
+entry:
+  %call = tail call ptr @_Z1Ci(i32 0), !dbg !21, !callsite !22
+  ret ptr null
+}
+
+define i32 @main() {
+entry:
+  %call = tail call ptr @_Z1Bi(i32 0), !dbg !23, !callsite !25
+  %call1 = tail call ptr @_Z1Bi(i32 0), !dbg !26, !callsite !27
+  %call2 = tail call ptr @_Z1Bi(i32 0), !dbg !28, !callsite !29
+  ret i32 0
+}
+
+declare ptr @_Znam(i64)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 20.0.0git (https://github.com/llvm/llvm-project.git 7aec6dc477f8148ed066d10dfc7a012a51b6599c)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "memprof_recursive.cc", directory: ".", checksumkind: CSK_MD5, checksum: "2f15f63b187a0e0d40e7fdd18b10576a")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "D", linkageName: "_Z1Dv", scope: !1, file: !1, line: 4, type: !4, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!4 = !DISubroutineType(types: !5)
+!5 = !{}
+!6 = !DILocation(line: 5, column: 10, scope: !3)
+!7 = !{!8, !10, !12}
+!8 = !{!9, !"cold"}
+!9 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 6307901912192269588}
+!10 = !{!11, !"notcold"}
+!11 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 -7155190423157709404, i64 -2954124005641725917, i64 8632435727821051414}
+!12 = !{!13, !"cold"}
+!13 = !{i64 6541423618768552252, i64 -200552803509692312, i64 -2954124005641725917, i64 -7155190423157709404, i64 -2954124005641725917, i64 -3421689549917153178}
+!14 = !{i64 6541423618768552252}
+!15 = distinct !DISubprogram(name: "C", linkageName: "_Z1Ci", scope: !1, file: !1, line: 8, type: !4, scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!16 = !DILocation(line: 10, column: 12, scope: !15)
+!17 = !{i64 -200552803509692312}
+!18 = !DILocation(line: 12, column: 10, scope: !15)
+!19 = !{i64 -7155190423157709404}
+!20 = distinct !DISubprogram(name: "B", linkageName: "_Z1Bi", scope: !1, file: !1, line: 14, type: !4, scopeLine: 14, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!21 = !DILocation(line: 15, column: 10, scope: !20)
+!22 = !{i64 -2954124005641725917}
+!23 = !DILocation(line: 18, column: 13, scope: !24)
+!24 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 17, type: !4, scopeLine: 17, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!25 = !{i64 8632435727821051414}
+!26 = !DILocation(line: 19, column: 13, scope: !24)
+!27 = !{i64 -3421689549917153178}
+!28 = !DILocation(line: 20, column: 13, scope: !24)
+!29 = !{i64 6307901912192269588}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll
index 7d9524420286..9f3e09d7420d 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
+; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
 target triple = "aarch64"
@@ -416,21 +416,23 @@ define internal noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20c
 ; CHECK-NEXT:    [[SCALE_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[SCALE_COERCE]] to i16
 ; CHECK-NEXT:    [[SCALE_SROA_2_0_EXTRACT_SHIFT36:%.*]] = lshr i64 [[SCALE_COERCE]], 16
 ; CHECK-NEXT:    [[SCALE_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[SCALE_SROA_2_0_EXTRACT_SHIFT36]] to i16
+; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
 ; CHECK-NEXT:    [[VECINIT_I19:%.*]] = insertelement <8 x i16> poison, i16 [[SCALE_SROA_0_0_EXTRACT_TRUNC]], i64 0
 ; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[SCALE_SROA_2_0_EXTRACT_TRUNC]], i64 0
 ; CHECK-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[VQNEGQ_V1_I:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[VECINIT7_I]])
+; CHECK-NEXT:    [[VBSL5_I:%.*]] = shufflevector <8 x i16> [[VQNEGQ_V1_I]], <8 x i16> [[VECINIT_I]], <8 x i32> <i32 0, i32 8, i32 2, i32 8, i32 4, i32 8, i32 6, i32 8>
 ; CHECK-NEXT:    [[SHUFFLE_I85:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[SHUFFLE_I82:%.*]] = shufflevector <8 x i16> [[VECINIT_I19]], <8 x i16> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[VQDMULL_V2_I72:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I85]], <4 x i16> [[SHUFFLE_I82]])
 ; CHECK-NEXT:    [[SHUFFLE_I97:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I97]], <4 x i16> [[SHUFFLE_I82]])
-; CHECK-NEXT:    [[SHUFFLE_I79:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[SHUFFLE_I76:%.*]] = shufflevector <8 x i16> [[VQNEGQ_V1_I]], <8 x i16> [[VECINIT_I]], <4 x i32> <i32 0, i32 8, i32 2, i32 8>
+; CHECK-NEXT:    [[SHUFFLE_I79:%.*]] = shufflevector <8 x i16> [[SHUFFLE_I]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[SHUFFLE_I76:%.*]] = shufflevector <8 x i16> [[VBSL5_I]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[VQDMLAL2_I106:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I79]], <4 x i16> [[SHUFFLE_I76]])
 ; CHECK-NEXT:    [[VQDMLAL_V3_I107:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMULL_V2_I72]], <4 x i32> [[VQDMLAL2_I106]])
-; CHECK-NEXT:    [[SHUFFLE_I91:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 5, i32 4, i32 7, i32 6>
-; CHECK-NEXT:    [[SHUFFLE_I88:%.*]] = shufflevector <8 x i16> [[VQNEGQ_V1_I]], <8 x i16> [[VECINIT_I]], <4 x i32> <i32 4, i32 8, i32 6, i32 8>
+; CHECK-NEXT:    [[SHUFFLE_I91:%.*]] = shufflevector <8 x i16> [[SHUFFLE_I]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[SHUFFLE_I88:%.*]] = shufflevector <8 x i16> [[VBSL5_I]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I91]], <4 x i16> [[SHUFFLE_I88]])
 ; CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMULL_V2_I]], <4 x i32> [[VQDMLAL2_I]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[VQDMLAL_V3_I107]] to <8 x i16>
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index 3c72d385dcf4..a4aea02a3351 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -566,22 +566,16 @@ define <4 x float> @add_v4f32_012u(<4 x float> %a, <4 x float> %b) {
 ; SSE2-NEXT:    ret <4 x float> [[RESULT1]]
 ;
 ; SSE4-LABEL: @add_v4f32_012u(
-; SSE4-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[B]], [[SHIFT]]
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 4, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 3, i32 5, i32 poison>
 ; SSE4-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
-; SSE4-NEXT:    ret <4 x float> [[RESULT]]
+; SSE4-NEXT:    ret <4 x float> [[TMP4]]
 ;
 ; AVX2-LABEL: @add_v4f32_012u(
-; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[B]], [[SHIFT]]
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 4, i32 poison>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 3, i32 5, i32 poison>
 ; AVX2-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
-; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
-; AVX2-NEXT:    ret <4 x float> [[RESULT]]
+; AVX2-NEXT:    ret <4 x float> [[TMP4]]
 ;
 ; AVX512-LABEL: @add_v4f32_012u(
 ; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
index cf1c948ac6af..bcb316a4a73e 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -566,22 +566,16 @@ define <4 x float> @sub_v4f32_012u(<4 x float> %a, <4 x float> %b) {
 ; SSE2-NEXT:    ret <4 x float> [[RESULT1]]
 ;
 ; SSE4-LABEL: @sub_v4f32_012u(
-; SSE4-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP1:%.*]] = fsub <4 x float> [[B]], [[SHIFT]]
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
 ; SSE4-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
-; SSE4-NEXT:    ret <4 x float> [[RESULT]]
+; SSE4-NEXT:    ret <4 x float> [[TMP4]]
 ;
 ; AVX2-LABEL: @sub_v4f32_012u(
-; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP1:%.*]] = fsub <4 x float> [[B]], [[SHIFT]]
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
 ; AVX2-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
-; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
-; AVX2-NEXT:    ret <4 x float> [[RESULT]]
+; AVX2-NEXT:    ret <4 x float> [[TMP4]]
 ;
 ; AVX512-LABEL: @sub_v4f32_012u(
 ; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
index 6dceabe1d324..00a4417ba7af 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
@@ -80,16 +80,11 @@ define half @reduce_fast_half8(<8 x half> %vec8) {
 ; NOFP16-LABEL: define half @reduce_fast_half8(
 ; NOFP16-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] {
 ; NOFP16-NEXT:  [[ENTRY:.*:]]
-; NOFP16-NEXT:    [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4
-; NOFP16-NEXT:    [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5
-; NOFP16-NEXT:    [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6
-; NOFP16-NEXT:    [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7
 ; NOFP16-NEXT:    [[TMP0:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; NOFP16-NEXT:    [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]])
-; NOFP16-NEXT:    [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT4]]
-; NOFP16-NEXT:    [[OP_RDX1:%.*]] = fadd fast half [[ELT5]], [[ELT6]]
-; NOFP16-NEXT:    [[OP_RDX2:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]]
-; NOFP16-NEXT:    [[OP_RDX3:%.*]] = fadd fast half [[OP_RDX2]], [[ELT7]]
+; NOFP16-NEXT:    [[TMP2:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NOFP16-NEXT:    [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP2]])
+; NOFP16-NEXT:    [[OP_RDX3:%.*]] = fadd fast half [[TMP1]], [[TMP3]]
 ; NOFP16-NEXT:    ret half [[OP_RDX3]]
 ;
 ; FULLFP16-LABEL: define half @reduce_fast_half8(
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/long-gep-chains.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/long-gep-chains.ll
new file mode 100644
index 000000000000..cf1ed54149b8
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/long-gep-chains.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=slp-vectorizer -mtriple=riscv64-unknown-linux -mattr=+v < %s |  FileCheck %s
+
+define i64 @test(ptr %arg, i32 %arg1, i64 %i) {
+; CHECK-LABEL: define i64 @test(
+; CHECK-SAME: ptr [[ARG:%.*]], i32 [[ARG1:%.*]], i64 [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[I2:%.*]] = getelementptr i8, ptr [[ARG]], i64 [[I]]
+; CHECK-NEXT:    [[I3:%.*]] = getelementptr i8, ptr [[I2]], i64 [[I]]
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr i8, ptr [[I3]], i64 [[I]]
+; CHECK-NEXT:    [[I5:%.*]] = getelementptr i8, ptr [[I4]], i64 [[I]]
+; CHECK-NEXT:    [[I6:%.*]] = getelementptr i8, ptr [[I5]], i64 [[I]]
+; CHECK-NEXT:    [[I7:%.*]] = getelementptr i8, ptr [[I6]], i64 [[I]]
+; CHECK-NEXT:    [[I8:%.*]] = getelementptr i8, ptr [[I7]], i64 [[I]]
+; CHECK-NEXT:    [[I9:%.*]] = getelementptr i8, ptr [[I8]], i64 [[I]]
+; CHECK-NEXT:    [[I10:%.*]] = getelementptr i8, ptr [[I9]], i64 [[I]]
+; CHECK-NEXT:    [[I11:%.*]] = getelementptr i8, ptr [[I10]], i64 [[I]]
+; CHECK-NEXT:    [[I12:%.*]] = getelementptr i8, ptr [[I11]], i64 [[I]]
+; CHECK-NEXT:    [[I13:%.*]] = getelementptr i8, ptr [[I12]], i64 [[I]]
+; CHECK-NEXT:    [[I14:%.*]] = getelementptr i8, ptr [[I13]], i64 [[I]]
+; CHECK-NEXT:    [[I140:%.*]] = load i8, ptr [[I14]], align 1
+; CHECK-NEXT:    [[I1412:%.*]] = zext i8 [[I140]] to i32
+; CHECK-NEXT:    [[I142:%.*]] = mul i32 [[ARG1]], [[I1412]]
+; CHECK-NEXT:    [[I143:%.*]] = getelementptr i8, ptr [[I13]], i64 15
+; CHECK-NEXT:    [[I144:%.*]] = load i8, ptr [[I143]], align 1
+; CHECK-NEXT:    [[I1453:%.*]] = zext i8 [[I144]] to i32
+; CHECK-NEXT:    [[I146:%.*]] = mul i32 [[ARG1]], [[I1453]]
+; CHECK-NEXT:    [[I147:%.*]] = getelementptr i8, ptr [[I13]], i64 14
+; CHECK-NEXT:    [[I148:%.*]] = load i8, ptr [[I147]], align 1
+; CHECK-NEXT:    [[I1494:%.*]] = zext i8 [[I148]] to i32
+; CHECK-NEXT:    [[I150:%.*]] = mul i32 [[ARG1]], [[I1494]]
+; CHECK-NEXT:    [[I151:%.*]] = getelementptr i8, ptr [[I13]], i64 13
+; CHECK-NEXT:    [[I152:%.*]] = load i8, ptr [[I151]], align 1
+; CHECK-NEXT:    [[I1535:%.*]] = zext i8 [[I152]] to i32
+; CHECK-NEXT:    [[I154:%.*]] = mul i32 [[ARG1]], [[I1535]]
+; CHECK-NEXT:    [[I1311:%.*]] = or i32 [[I142]], [[I146]]
+; CHECK-NEXT:    [[I1312:%.*]] = or i32 [[I1311]], [[I150]]
+; CHECK-NEXT:    [[I1313:%.*]] = or i32 [[I1312]], [[I154]]
+; CHECK-NEXT:    [[I1536:%.*]] = zext i32 [[I1313]] to i64
+; CHECK-NEXT:    ret i64 [[I1536]]
+;
+bb:
+  %i2 = getelementptr i8, ptr %arg, i64 %i
+  %i3 = getelementptr i8, ptr %i2, i64 %i
+  %i4 = getelementptr i8, ptr %i3, i64 %i
+  %i5 = getelementptr i8, ptr %i4, i64 %i
+  %i6 = getelementptr i8, ptr %i5, i64 %i
+  %i7 = getelementptr i8, ptr %i6, i64 %i
+  %i8 = getelementptr i8, ptr %i7, i64 %i
+  %i9 = getelementptr i8, ptr %i8, i64 %i
+  %i10 = getelementptr i8, ptr %i9, i64 %i
+  %i11 = getelementptr i8, ptr %i10, i64 %i
+  %i12 = getelementptr i8, ptr %i11, i64 %i
+  %i13 = getelementptr i8, ptr %i12, i64 %i
+  %i14 = getelementptr i8, ptr %i13, i64 %i
+  %i140 = load i8, ptr %i14, align 1
+  %i1412 = zext i8 %i140 to i32
+  %i142 = mul i32 %arg1, %i1412
+  %i143 = getelementptr i8, ptr %i13, i64 15
+  %i144 = load i8, ptr %i143, align 1
+  %i1453 = zext i8 %i144 to i32
+  %i146 = mul i32 %arg1, %i1453
+  %i147 = getelementptr i8, ptr %i13, i64 14
+  %i148 = load i8, ptr %i147, align 1
+  %i1494 = zext i8 %i148 to i32
+  %i150 = mul i32 %arg1, %i1494
+  %i151 = getelementptr i8, ptr %i13, i64 13
+  %i152 = load i8, ptr %i151, align 1
+  %i1535 = zext i8 %i152 to i32
+  %i154 = mul i32 %arg1, %i1535
+  %i1311 = or i32 %i142, %i146
+  %i1312 = or i32 %i1311, %i150
+  %i1313 = or i32 %i1312, %i154
+  %i1536 = zext i32 %i1313 to i64
+  ret i64 %i1536
+}
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll
index 92766d5a11aa..420e844b5103 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll
@@ -11,7 +11,7 @@ target triple = "nvptx64-nvidia-cuda"
 ;       use((b + i) * s);
 ;   }
 ; }
-define void @foo(i32 %b, i32 %s) {
+define ptx_kernel void @foo(i32 %b, i32 %s) {
 ; CHECK-LABEL: .visible .entry foo(
 entry:
 ; CHECK: ld.param.u32 [[s:%r[0-9]+]], [foo_param_1];
@@ -65,7 +65,3 @@ for.inc.3:                                        ; preds = %if.then.3, %for.inc
 declare zeroext i1 @cond(i32)
 
 declare void @use(i32)
-
-!nvvm.annotations = !{!0}
-
-!0 = !{ptr @foo, !"kernel", i32 1}
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
index 6ef18e66d421..f3b7f7b72ee4 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
@@ -465,28 +465,13 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
-; SSE-LABEL: @PR34724(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; SSE-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B1:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
-; SSE-NEXT:    [[B:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B1]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; SSE-NEXT:    ret <4 x float> [[TMP3]]
-;
-; AVX-LABEL: @PR34724(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; AVX-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; AVX-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; AVX-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; AVX-NEXT:    [[V1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 poison, i32 2, i32 poison, i32 poison>
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
-; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; AVX-NEXT:    ret <4 x float> [[V3]]
+; CHECK-LABEL: @PR34724(
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[V3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x float> [[V3]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
index 307fbf711cdc..c125b73fccdd 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -465,34 +465,19 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
-; SSE-LABEL: @PR34724(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; SSE-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; SSE-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; SSE-NEXT:    [[V1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
-; SSE-NEXT:    [[V2:%.*]] = shufflevector <4 x float> [[V1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
-; SSE-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; SSE-NEXT:    ret <4 x float> [[V3]]
-;
-; AVX-LABEL: @PR34724(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; AVX-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; AVX-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; AVX-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; AVX-NEXT:    [[V1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
-; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; AVX-NEXT:    ret <4 x float> [[V3]]
+; CHECK-LABEL: @PR34724(
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
+; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
+; CHECK-NEXT:    [[V1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
+; CHECK-NEXT:    [[V2:%.*]] = shufflevector <4 x float> [[V1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[V3]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
index 95068ad1f2a4..f9108efa7ee7 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-cmps.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512
 
 declare void @use(<4 x i1>)
 
@@ -105,8 +105,8 @@ define <4 x i32> @shuf_icmp_ugt_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z,
 define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
 ; SSE2-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
 ; SSE2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 2, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
 ; SSE2-NEXT:    [[S:%.*]] = fcmp oeq <4 x float> [[TMP1]], [[TMP2]]
 ; SSE2-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
 ; SSE2-NEXT:    ret <4 x i32> [[R]]
@@ -115,21 +115,29 @@ define <4 x i32> @shuf_fcmp_oeq_v4i32(<4 x float> %x, <4 x float> %y, <4 x float
 ; SSE4-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
 ; SSE4-NEXT:    [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
 ; SSE4-NEXT:    [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
-; SSE4-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+; SSE4-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
 ; SSE4-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
 ; SSE4-NEXT:    ret <4 x i32> [[R]]
 ;
-; AVX-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
-; AVX-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; AVX-NEXT:    [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
-; AVX-NEXT:    [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
-; AVX-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
-; AVX-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
-; AVX-NEXT:    ret <4 x i32> [[R]]
+; AVX2-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
+; AVX2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT:    [[B0:%.*]] = fcmp oeq <4 x float> [[X]], [[Y]]
+; AVX2-NEXT:    [[B1:%.*]] = fcmp oeq <4 x float> [[X]], [[Z]]
+; AVX2-NEXT:    [[S:%.*]] = shufflevector <4 x i1> [[B0]], <4 x i1> [[B1]], <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
+; AVX2-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX512-LABEL: define <4 x i32> @shuf_fcmp_oeq_v4i32(
+; AVX512-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
+; AVX512-NEXT:    [[S:%.*]] = fcmp oeq <4 x float> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    [[R:%.*]] = sext <4 x i1> [[S]] to <4 x i32>
+; AVX512-NEXT:    ret <4 x i32> [[R]]
 ;
   %b0 = fcmp oeq <4 x float> %x, %y
   %b1 = fcmp oeq <4 x float> %x, %z
-  %s = shufflevector <4 x i1> %b0, <4 x i1> %b1, <4 x i32> <i32 poison, i32 poison, i32 6, i32 0>
+  %s = shufflevector <4 x i1> %b0, <4 x i1> %b1, <4 x i32> <i32 poison, i32 poison, i32 4, i32 0>
   %r = sext <4 x i1> %s to <4 x i32>
   ret <4 x i32> %r
 }
diff --git a/llvm/test/tools/UpdateTestChecks/lit.local.cfg b/llvm/test/tools/UpdateTestChecks/lit.local.cfg
index a954eb7ba174..7147769502a6 100644
--- a/llvm/test/tools/UpdateTestChecks/lit.local.cfg
+++ b/llvm/test/tools/UpdateTestChecks/lit.local.cfg
@@ -10,7 +10,7 @@ except ImportError:
     from pipes import quote as shell_quote
 
 
-def add_update_script_substition(
+def add_update_script_substitution(
     name, python_exe=config.python_executable, extra_args=""
 ):
     assert name.startswith("%")
@@ -33,26 +33,26 @@ llc_path = os.path.join(config.llvm_tools_dir, "llc")
 if os.path.isfile(llc_path):
     config.available_features.add("llc-binary")
     llc_arg = "--llc-binary " + shell_quote(llc_path)
-    add_update_script_substition("%update_llc_test_checks", extra_args=llc_arg)
-    add_update_script_substition("%update_mir_test_checks", extra_args=llc_arg)
+    add_update_script_substitution("%update_llc_test_checks", extra_args=llc_arg)
+    add_update_script_substitution("%update_mir_test_checks", extra_args=llc_arg)
 
 opt_path = os.path.join(config.llvm_tools_dir, "opt")
 if os.path.isfile(opt_path):
     config.available_features.add("opt-binary")
     opt_arg = "--opt-binary " + shell_quote(opt_path)
-    add_update_script_substition("%update_test_checks", extra_args=opt_arg)
-    add_update_script_substition("%update_analyze_test_checks", extra_args=opt_arg)
+    add_update_script_substitution("%update_test_checks", extra_args=opt_arg)
+    add_update_script_substitution("%update_analyze_test_checks", extra_args=opt_arg)
 
 llvm_mca_path = os.path.join(config.llvm_tools_dir, "llvm-mca")
 if os.path.isfile(llvm_mca_path):
     config.available_features.add("llvm-mca-binary")
     mca_arg = "--llvm-mca-binary " + shell_quote(llvm_mca_path)
-    add_update_script_substition("%update_test_checks", extra_args=mca_arg)
+    add_update_script_substitution("%update_test_checks", extra_args=mca_arg)
 
 split_file_path = os.path.join(config.llvm_tools_dir, "split-file")
 if os.path.isfile(split_file_path):
-    add_update_script_substition("%update_test_body")
+    add_update_script_substitution("%update_test_body")
 
 llvm_mc_path = os.path.join(config.llvm_tools_dir, "llvm-mc")
 if os.path.isfile(llvm_mc_path):
-    add_update_script_substition("%update_mc_test_checks")
+    add_update_script_substitution("%update_mc_test_checks")
diff --git a/llvm/test/tools/dxil-dis/fastmath.ll b/llvm/test/tools/dxil-dis/fastmath.ll
new file mode 100644
index 000000000000..7f4ba5b4cdd9
--- /dev/null
+++ b/llvm/test/tools/dxil-dis/fastmath.ll
@@ -0,0 +1,23 @@
+; RUN: llc %s --filetype=obj -o - | dxil-dis -o - | FileCheck %s
+target triple = "dxil-unknown-shadermodel6.7-library"
+
+define float @fma(float %0, float %1, float %2) #0 {
+  ; verify reassoc and contract are converted to fast
+  ; CHECK: %4 = fmul fast float %0, %1
+  %4 = fmul reassoc float %0, %1
+  ; CHECK-NEXT: %5 = fadd fast float %4, %2
+  %5 = fadd contract float %4, %2
+  ; verify these are converted to a single fast flag
+  ; CHECK-NEXT: %6 = fmul fast float %0, %1
+  %6 = fmul reassoc contract float %0, %1
+  ; verify these flags are maintained
+  ; CHECK-NEXT: %7 = fadd nnan ninf nsz arcp float %0, %1
+  %7 = fadd nnan ninf nsz arcp float %0, %1
+  ; verify that afn is removed
+  ; CHECK-NEXT: %8 = fmul float %0, %1
+  %8 = fmul afn float %0, %1
+  ret float %5
+}
+
+attributes #0 = { norecurse nounwind readnone willreturn "disable-tail-calls"="false" "waveops-include-helper-lanes" "fp32-denorm-mode"="any" "hlsl.export" }
+
diff --git a/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-merged-funcs-dwarf.yaml b/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-merged-funcs-dwarf.yaml
index 97dfc61ce1e1..522c57685442 100644
--- a/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-merged-funcs-dwarf.yaml
+++ b/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-merged-funcs-dwarf.yaml
@@ -71,11 +71,11 @@
 #### TODO: Fix non-determinism leading that is currently worked around with `{{[1-3]}}` below.
 
 # CHECK-MERGED-LOOKUP: Found 3 functions at address 0x0000000000000248:
-# CHECK-MERGED-LOOKUP-NEXT:       0x0000000000000248: my_func_0{{[1-3]}} @ /tmp/test_gsym_yaml/out/file_0{{[1-3]}}.cpp:5
-# CHECK-MERGED-LOOKUP-NEXT-NEXT:  0x0000000000000248: my_func_0{{[1-3]}} @ /tmp/test_gsym_yaml/out/file_0{{[1-3]}}.cpp:5
-# CHECK-MERGED-LOOKUP-NEXT-NEXT:  0x0000000000000248: my_func_0{{[1-3]}} @ /tmp/test_gsym_yaml/out/file_0{{[1-3]}}.cpp:5
+# CHECK-MERGED-LOOKUP-NEXT:       0x0000000000000248: my_func_0{{[1-3]}} @ /tmp/test_gsym_yaml{{[/\\]}}out/file_0{{[1-3]}}.cpp:5
+# CHECK-MERGED-LOOKUP-NEXT-NEXT:  0x0000000000000248: my_func_0{{[1-3]}} @ /tmp/test_gsym_yaml{{[/\\]}}out/file_0{{[1-3]}}.cpp:5
+# CHECK-MERGED-LOOKUP-NEXT-NEXT:  0x0000000000000248: my_func_0{{[1-3]}} @ /tmp/test_gsym_yaml{{[/\\]}}out/file_0{{[1-3]}}.cpp:5
  
-# CHECK-NORMAL-LOOKUP: 0x0000000000000248: my_func_0{{[1-3]}} @ /tmp/test_gsym_yaml/out/file_0{{[1-3]}}.cpp:5
+# CHECK-NORMAL-LOOKUP: 0x0000000000000248: my_func_0{{[1-3]}} @ /tmp/test_gsym_yaml{{[/\\]}}out/file_0{{[1-3]}}.cpp:5
 
 
 --- !mach-o
diff --git a/llvm/test/tools/llvm-mca/ARM/cortex-a57-memory-instructions.s b/llvm/test/tools/llvm-mca/ARM/cortex-a57-memory-instructions.s
index 04c95f62fbe1..36a2f04f4ace 100644
--- a/llvm/test/tools/llvm-mca/ARM/cortex-a57-memory-instructions.s
+++ b/llvm/test/tools/llvm-mca/ARM/cortex-a57-memory-instructions.s
@@ -325,11 +325,11 @@
 # CHECK-NEXT:  2      1     1.00           *            strd	r4, r5, [r12], -r10
 # CHECK-NEXT:  1      1     1.00           *            strh	r3, [r4]
 # CHECK-NEXT:  1      1     1.00           *            strh	r2, [r7, #4]
-# CHECK-NEXT:  2      1     1.00                  U     strh	r1, [r8, #64]!
+# CHECK-NEXT:  2      1     1.00           *            strh	r1, [r8, #64]!
 # CHECK-NEXT:  2      1     1.00           *            strh	r12, [sp], #4
 # CHECK-NEXT:  1      1     1.00           *            strh	r6, [r5, r4]
-# CHECK-NEXT:  2      1     1.00                  U     strh	r3, [r8, r11]!
-# CHECK-NEXT:  2      1     1.00                  U     strh	r1, [r2, -r1]!
+# CHECK-NEXT:  2      1     1.00           *            strh	r3, [r8, r11]!
+# CHECK-NEXT:  2      1     1.00           *            strh	r1, [r2, -r1]!
 # CHECK-NEXT:  2      1     1.00           *            strh	r9, [r7], r2
 # CHECK-NEXT:  2      1     1.00           *            strh	r4, [r3], -r2
 # CHECK-NEXT:  2      1     1.00                  U     strht	r2, [r5], #76
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/div.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/div.s
new file mode 100644
index 000000000000..c42b4a9ef4ac
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/div.s
@@ -0,0 +1,1009 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-p470 -iterations=1 < %s | FileCheck %s
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, mf4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, mf2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, mf8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, mf4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, mf2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, mf8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, mf4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, mf2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, mf8, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, mf4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, mf2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m1, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m2, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m4, tu, mu
+vdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m8, tu, mu
+vdiv.vv v8, v16, v24
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, mf4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, mf2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, m2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, m4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e8, m8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, mf8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, mf4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, mf2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, m2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, m4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e16, m8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, mf8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, mf4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, mf2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, m2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, m4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e32, m8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, mf8, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, mf4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, mf2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, m1, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, m2, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, m4, tu, mu
+vdiv.vx v8, v16, a0
+vsetvli zero, zero, e64, m8, tu, mu
+vdiv.vx v8, v16, a0
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, mf4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, mf2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e8, m8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, mf8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, mf4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, mf2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e16, m8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, mf8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, mf4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, mf2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e32, m8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, mf8, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, mf4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, mf2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m1, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m2, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m4, tu, mu
+vfdiv.vv v8, v16, v24
+vsetvli zero, zero, e64, m8, tu, mu
+vfdiv.vv v8, v16, v24
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, mf4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, mf2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, m2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, m4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e8, m8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, mf8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, mf4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, mf2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, m2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, m4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e16, m8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, mf8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, mf4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, mf2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, m2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, m4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e32, m8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, mf8, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, mf4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, mf2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, m1, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, m2, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, m4, tu, mu
+vfdiv.vf v8, v16, fa0
+vsetvli zero, zero, e64, m8, tu, mu
+vfdiv.vf v8, v16, fa0
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, mf4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, mf2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, m2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, m4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e8, m8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, mf8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, mf4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, mf2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, m2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, m4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e16, m8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, mf8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, mf4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, mf2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, m2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, m4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e32, m8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, mf8, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, mf4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, mf2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, m1, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, m2, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, m4, tu, mu
+vfsqrt.v v8, v16
+vsetvli zero, zero, e64, m8, tu, mu
+vfsqrt.v v8, v16
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      320
+# CHECK-NEXT: Total Cycles:      22358
+# CHECK-NEXT: Total uOps:        320
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.01
+# CHECK-NEXT: IPC:               0.01
+# CHECK-NEXT: Block RThroughput: 14361.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      102   102.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      204   204.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      90    90.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      180   180.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      360   360.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      42    42.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      42    42.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      42    42.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      84    84.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      168   168.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      336   336.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      72    72.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      72    72.00                       vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      144   144.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      288   288.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      576   576.00                      vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      51    51.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      102   102.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      204   204.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      45    45.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      90    90.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      180   180.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      360   360.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      42    42.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      42    42.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      42    42.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      84    84.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      168   168.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      336   336.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  1      408   408.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      72    72.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      72    72.00                       vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      144   144.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      288   288.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      576   576.00                      vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      58    58.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      116   116.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      50    50.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      100   100.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      200   200.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      74    74.00                       vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      148   148.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      296   296.00                      vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      58    58.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      116   116.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      50    50.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      100   100.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      200   200.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      74    74.00                       vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      148   148.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      296   296.00                      vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  1      29    29.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  1      58    58.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  1      116   116.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  1      25    25.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  1      50    50.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  1      100   100.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  1      200   200.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  1      232   232.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  1      37    37.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  1      74    74.00                       vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  1      148   148.00                      vfsqrt.v	v8, v16
+# CHECK-NEXT:  1      1     1.00                  U     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  1      296   296.00                      vfsqrt.v	v8, v16
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SiFiveP400Div
+# CHECK-NEXT: [1]   - SiFiveP400FEXQ0
+# CHECK-NEXT: [2]   - SiFiveP400FloatDiv
+# CHECK-NEXT: [3]   - SiFiveP400IEXQ0
+# CHECK-NEXT: [4]   - SiFiveP400IEXQ1
+# CHECK-NEXT: [5]   - SiFiveP400IEXQ2
+# CHECK-NEXT: [6]   - SiFiveP400Load
+# CHECK-NEXT: [7]   - SiFiveP400Store
+# CHECK-NEXT: [8]   - SiFiveP400VDiv
+# CHECK-NEXT: [9]   - SiFiveP400VEXQ0
+# CHECK-NEXT: [10]  - SiFiveP400VFloatDiv
+# CHECK-NEXT: [11]  - SiFiveP400VLD
+# CHECK-NEXT: [12]  - SiFiveP400VST
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]
+# CHECK-NEXT:  -      -      -      -     160.00  -      -      -     12186.00 725.00 14361.00  -   -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   Instructions:
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     102.00 2.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     204.00 4.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     90.00  2.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     180.00 4.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     360.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     42.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     42.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     42.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     84.00  2.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     168.00 4.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     336.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     72.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     72.00  1.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     144.00 2.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     288.00 4.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     576.00 8.00    -      -      -     vdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     51.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     102.00 2.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     204.00 4.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     45.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     90.00  2.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     180.00 4.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     360.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     42.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     42.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     42.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     84.00  2.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     168.00 4.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     336.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     408.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     72.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     72.00  1.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     144.00 2.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     288.00 4.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     576.00 8.00    -      -      -     vdiv.vx	v8, v16, a0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   58.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   116.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   50.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   100.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   200.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   37.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   37.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   74.00   -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   148.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   296.00  -      -     vfdiv.vv	v8, v16, v24
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   58.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   116.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   50.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   100.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   200.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   37.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   37.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   74.00   -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   148.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   296.00  -      -     vfdiv.vf	v8, v16, fa0
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e8, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   29.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   58.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   116.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e16, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   25.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   50.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   100.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e32, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   200.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, mf2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   232.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   37.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m1, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   37.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m2, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   74.00   -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m4, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     4.00   148.00  -      -     vfsqrt.v	v8, v16
+# CHECK-NEXT:  -      -      -      -     1.00    -      -      -      -      -      -      -      -     vsetvli	zero, zero, e64, m8, tu, mu
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     8.00   296.00  -      -     vfsqrt.v	v8, v16
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/mul-cpop.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/mul-cpop.s
new file mode 100644
index 000000000000..5f7a1d1dce09
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveP400/mul-cpop.s
@@ -0,0 +1,60 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-p470 -iterations=1 < %s | FileCheck %s
+
+mul s6, s6, s7
+
+mulw s4, s4, a2
+
+cpop t1, t1
+
+cpopw t2, t2
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      4
+# CHECK-NEXT: Total Cycles:      8
+# CHECK-NEXT: Total uOps:        4
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      2     1.00                        mul	s6, s6, s7
+# CHECK-NEXT:  1      2     1.00                        mulw	s4, s4, a2
+# CHECK-NEXT:  1      2     1.00                        cpop	t1, t1
+# CHECK-NEXT:  1      2     1.00                        cpopw	t2, t2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SiFiveP400Div
+# CHECK-NEXT: [1]   - SiFiveP400FEXQ0
+# CHECK-NEXT: [2]   - SiFiveP400FloatDiv
+# CHECK-NEXT: [3]   - SiFiveP400IEXQ0
+# CHECK-NEXT: [4]   - SiFiveP400IEXQ1
+# CHECK-NEXT: [5]   - SiFiveP400IEXQ2
+# CHECK-NEXT: [6]   - SiFiveP400Load
+# CHECK-NEXT: [7]   - SiFiveP400Store
+# CHECK-NEXT: [8]   - SiFiveP400VDiv
+# CHECK-NEXT: [9]   - SiFiveP400VEXQ0
+# CHECK-NEXT: [10]  - SiFiveP400VFloatDiv
+# CHECK-NEXT: [11]  - SiFiveP400VLD
+# CHECK-NEXT: [12]  - SiFiveP400VST
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]
+# CHECK-NEXT:  -      -      -      -      -     4.00    -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -     mul	s6, s6, s7
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -     mulw	s4, s4, a2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -     cpop	t1, t1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -     cpopw	t2, t2
diff --git a/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test b/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test
new file mode 100644
index 000000000000..19b06b1ec02c
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test
@@ -0,0 +1,217 @@
+# RUN: rm -rf %t && mkdir %t
+# RUN: yaml2obj %s -o %t/original
+# RUN: llvm-strip --strip-all %t/original -o %t/stripped
+# RUN: llvm-readobj --macho-segment %t/stripped | FileCheck %s
+
+# CHECK-LABEL: Name: __PAGEZERO
+# CHECK:       fileoff: 16384
+
+# CHECK-LABEL: Name: __TEXT
+# CHECK:       fileoff: 16384
+
+# The YAML below is the following code
+# int main(int argc, char **argv) { return 0; }
+# Compiled on macOS against the macOS SDK and passing `-Wl,-encryptable`
+# Contents are removed, since they are not important for the test. We need a
+# small text segment (smaller than a page).
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x100000C
+  cpusubtype:      0x0
+  filetype:        0x2
+  ncmds:           15
+  sizeofcmds:      696
+  flags:           0x200085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __PAGEZERO
+    vmaddr:          0
+    vmsize:          4294967296
+    fileoff:         0
+    filesize:        0
+    maxprot:         0
+    initprot:        0
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         232
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          32768
+    fileoff:         0
+    filesize:        32768
+    maxprot:         5
+    initprot:        5
+    nsects:          2
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x100004000
+        size:            32
+        offset:          0x4000
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+      - sectname:        __unwind_info
+        segname:         __TEXT
+        addr:            0x100004020
+        size:            4152
+        offset:          0x4020
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4295000064
+    vmsize:          592
+    fileoff:         32768
+    filesize:        592
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_DYLD_CHAINED_FIXUPS
+    cmdsize:         16
+    dataoff:         32768
+    datasize:        48
+  - cmd:             LC_DYLD_EXPORTS_TRIE
+    cmdsize:         16
+    dataoff:         32816
+    datasize:        48
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          32872
+    nsyms:           2
+    stroff:          32904
+    strsize:         32
+  - cmd:             LC_DYSYMTAB
+    cmdsize:         80
+    ilocalsym:       0
+    nlocalsym:       0
+    iextdefsym:      0
+    nextdefsym:      2
+    iundefsym:       2
+    nundefsym:       0
+    tocoff:          0
+    ntoc:            0
+    modtaboff:       0
+    nmodtab:         0
+    extrefsymoff:    0
+    nextrefsyms:     0
+    indirectsymoff:  0
+    nindirectsyms:   0
+    extreloff:       0
+    nextrel:         0
+    locreloff:       0
+    nlocrel:         0
+  - cmd:             LC_ENCRYPTION_INFO_64
+    cmdsize:         24
+    cryptoff:        16384
+    cryptsize:       16384
+    cryptid:         0
+    pad:             0
+  - cmd:             LC_LOAD_DYLINKER
+    cmdsize:         32
+    name:            12
+    Content:         '/usr/lib/dyld'
+    ZeroPadBytes:    7
+  - cmd:             LC_UUID
+    cmdsize:         24
+    uuid:            4C4C4447-5555-3144-A18A-01E9EB7E7D92
+  - cmd:             LC_BUILD_VERSION
+    cmdsize:         32
+    platform:        1
+    minos:           983040
+    sdk:             983552
+    ntools:          1
+    Tools:
+      - tool:            4
+        version:         1310720
+  - cmd:             LC_MAIN
+    cmdsize:         24
+    entryoff:        16384
+    stacksize:       0
+  - cmd:             LC_FUNCTION_STARTS
+    cmdsize:         16
+    dataoff:         32864
+    datasize:        8
+  - cmd:             LC_DATA_IN_CODE
+    cmdsize:         16
+    dataoff:         32872
+    datasize:        0
+  - cmd:             LC_CODE_SIGNATURE
+    cmdsize:         16
+    dataoff:         32944
+    datasize:        416
+LinkEditData:
+  ExportTrie:
+    TerminalSize:    0
+    NodeOffset:      0
+    Name:            ''
+    Flags:           0x0
+    Address:         0x0
+    Other:           0x0
+    ImportName:      ''
+    Children:
+      - TerminalSize:    0
+        NodeOffset:      5
+        Name:            _
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    4
+            NodeOffset:      33
+            Name:            main
+            Flags:           0x0
+            Address:         0x4000
+            Other:           0x0
+            ImportName:      ''
+          - TerminalSize:    2
+            NodeOffset:      39
+            Name:            _mh_execute_header
+            Flags:           0x0
+            Address:         0x0
+            Other:           0x0
+            ImportName:      ''
+  NameList:
+    - n_strx:          2
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          0
+      n_value:         4294983680
+    - n_strx:          8
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          16
+      n_value:         4294967296
+  StringTable:
+    - ' '
+    - _main
+    - __mh_execute_header
+    - ''
+    - ''
+    - ''
+    - ''
+  FunctionStarts:  [ 0x4000 ]
+  ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x30, 0x0, 
+                     0x0, 0x0, 0x30, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                     0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                     0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
+                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ]
+...
+
diff --git a/llvm/test/tools/llvm-profgen/context-depth.test b/llvm/test/tools/llvm-profgen/context-depth.test
new file mode 100644
index 000000000000..4eaa5fa1eae9
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/context-depth.test
@@ -0,0 +1,125 @@
+; Test --csprof-max-context-depth and --csprof-max-unsymbolized-context-depth
+
+; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-context-depth=0 --csspgo-preinliner=0 --gen-cs-nested-profile=0
+; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-CTX-DEPTH
+; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-unsymbolized-context-depth=2 --csspgo-preinliner=0 --gen-cs-nested-profile=0 --skip-symbolization
+; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-UNSYM-CTX-DEPTH
+; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-unsymbolized-context-depth=2 --csspgo-preinliner=0 --gen-cs-nested-profile=0
+; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-UNSYM-CTX-DEPTH-PROF
+; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-unsymbolized-context-depth=2 --csprof-max-context-depth=0  --csspgo-preinliner=0 --gen-cs-nested-profile=0
+; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-CTX-DEPTH
+
+
+; CHECK-MAX-CTX-DEPTH: [fb]:19:6
+; CHECK-MAX-CTX-DEPTH:  1: 6
+; CHECK-MAX-CTX-DEPTH:  2: 3
+; CHECK-MAX-CTX-DEPTH:  3: 3
+; CHECK-MAX-CTX-DEPTH:  4: 0
+; CHECK-MAX-CTX-DEPTH:  5: 4 fb:4
+; CHECK-MAX-CTX-DEPTH:  6: 3 fa:3
+; CHECK-MAX-CTX-DEPTH:  !CFGChecksum: 563022570642068
+; CHECK-MAX-CTX-DEPTH: [fa]:14:4
+; CHECK-MAX-CTX-DEPTH:  1: 4
+; CHECK-MAX-CTX-DEPTH:  3: 4
+; CHECK-MAX-CTX-DEPTH:  4: 2
+; CHECK-MAX-CTX-DEPTH:  5: 1
+; CHECK-MAX-CTX-DEPTH:  6: 0
+; CHECK-MAX-CTX-DEPTH:  7: 2 fb:2
+; CHECK-MAX-CTX-DEPTH:  8: 1 fa:1
+; CHECK-MAX-CTX-DEPTH:  !CFGChecksum: 563070469352221
+
+
+; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7ab @ 0x7ab]
+; CHECK-MAX-UNSYM-CTX-DEPTH:   3
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a0-7a7:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a0-7ab:3
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7b2-7b5:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   3
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a7->7b2:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7ab->7a0:4
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7b5->7c0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7ab @ 0x7b5]
+; CHECK-MAX-UNSYM-CTX-DEPTH:   1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7c0-7d4:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7d4->7c0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7b5 @ 0x7d4]
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7c0-7cd:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7db-7e0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7cd->7db:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7e0->7a0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7b5 @ 0x7e0]
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a0-7a7:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7b2-7b5:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a7->7b2:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7b5->7c0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7d4 @ 0x7e0]
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a0-7a7:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7b2-7b5:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7a7->7b2:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7b5->7c0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH: [0x7e0 @ 0x7b5]
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7c0-7cd:2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7db-7e0:1
+; CHECK-MAX-UNSYM-CTX-DEPTH:   2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7cd->7db:2
+; CHECK-MAX-UNSYM-CTX-DEPTH:   7e0->7a0:1
+
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fb:5 @ fb:5 @ fb]:13:4
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  1: 4
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  2: 3
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  3: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  4: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  5: 4 fb:4
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  6: 1 fa:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  !CFGChecksum: 563022570642068
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fa:7 @ fb:6 @ fa]:6:2
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  1: 2
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  3: 2
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  4: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  5: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  6: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  7: 1 fb:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  8: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  !CFGChecksum: 563070469352221
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fb:5 @ fb:6 @ fa]:4:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  1: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  3: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  4: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  5: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  6: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  7: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  8: 1 fa:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  !CFGChecksum: 563070469352221
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fb:6 @ fa:8 @ fa]:4:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  1: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  3: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  4: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  5: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  6: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  7: 1 fb:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  8: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  !CFGChecksum: 563070469352221
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fa:8 @ fa:7 @ fb]:3:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  1: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  2: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  3: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  4: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  5: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  6: 1 fa:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  !CFGChecksum: 563022570642068
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF: [fb:6 @ fa:7 @ fb]:3:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  1: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  2: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  3: 1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  4: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  5: 0
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  6: 1 fa:1
+; CHECK-MAX-UNSYM-CTX-DEPTH-PROF:  !CFGChecksum: 563022570642068
diff --git a/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test b/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test
index c673028584c0..b8e3e248e779 100644
--- a/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test
+++ b/llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test
@@ -9,9 +9,6 @@
 ; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-UNWINDER
 ; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe-nommap.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --profile-summary-hot-count=0 --csspgo-preinliner=0 --gen-cs-nested-profile=0
 ; RUN: FileCheck %s --input-file %t
-; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-hot-count=0 --csprof-max-context-depth=0 --csspgo-preinliner=0 --gen-cs-nested-profile=0
-; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-CTX-DEPTH
-
 
 ; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa]:4:1
 ; CHECK-UNCOMPRESS:  1: 1
@@ -68,23 +65,6 @@
 ; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb]:1:0
 ; CHECK-UNCOMPRESS:  5: 1 fb:1
 ; CHECK-UNCOMPRESS:  !CFGChecksum: 563022570642068
-; CHECK-MAX-CTX-DEPTH: [fb]:19:6
-; CHECK-MAX-CTX-DEPTH:  1: 6
-; CHECK-MAX-CTX-DEPTH:  2: 3
-; CHECK-MAX-CTX-DEPTH:  3: 3
-; CHECK-MAX-CTX-DEPTH:  4: 0
-; CHECK-MAX-CTX-DEPTH:  5: 4 fb:4
-; CHECK-MAX-CTX-DEPTH:  6: 3 fa:3
-; CHECK-MAX-CTX-DEPTH:  !CFGChecksum: 563022570642068
-; CHECK-MAX-CTX-DEPTH: [fa]:14:4
-; CHECK-MAX-CTX-DEPTH:  1: 4
-; CHECK-MAX-CTX-DEPTH:  3: 4
-; CHECK-MAX-CTX-DEPTH:  4: 2
-; CHECK-MAX-CTX-DEPTH:  5: 1
-; CHECK-MAX-CTX-DEPTH:  6: 0
-; CHECK-MAX-CTX-DEPTH:  7: 2 fb:2
-; CHECK-MAX-CTX-DEPTH:  8: 1 fa:1
-; CHECK-MAX-CTX-DEPTH:  !CFGChecksum: 563070469352221
 
 
 ; CHECK: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb]:13:4
diff --git a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp
index 41d361532908..5636782bdf7f 100644
--- a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp
@@ -24,6 +24,8 @@
 namespace llvm {
 namespace exegesis {
 
+#include "RISCVGenExegesis.inc"
+
 namespace {
 
 // Stores constant value to a general-purpose (integer) register.
@@ -132,8 +134,7 @@ public:
 };
 
 ExegesisRISCVTarget::ExegesisRISCVTarget()
-    : ExegesisTarget(ArrayRef<CpuAndPfmCounters>{},
-                     RISCV_MC::isOpcodeAvailable) {}
+    : ExegesisTarget(RISCVCpuPfmCounters, RISCV_MC::isOpcodeAvailable) {}
 
 bool ExegesisRISCVTarget::matchesArch(Triple::ArchType Arch) const {
   return Arch == Triple::riscv32 || Arch == Triple::riscv64;
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
index 646d4cef01a5..963c36322c8a 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -1667,6 +1667,12 @@ static Error sanitizeArguments(const Triple &TT, const char *ArgV0) {
                                      inconvertibleErrorCode());
   }
 
+#ifndef NDEBUG
+  if (DebugFlag && MaterializationThreads != 0)
+    errs() << "Warning: debugging output is not thread safe. "
+              "Use -num-threads=0 to stabilize output.\n";
+#endif // NDEBUG
+
   // Only one of -oop-executor and -oop-executor-connect can be used.
   if (!!OutOfProcessExecutor.getNumOccurrences() &&
       !!OutOfProcessExecutorConnect.getNumOccurrences())
@@ -2556,6 +2562,7 @@ int main(int argc, char *argv[]) {
     if (Timers)
       Timers->JITLinkTG.printAll(errs());
     reportLLVMJITLinkError(EntryPoint.takeError());
+    ExitOnErr(S->ES.endSession());
     exit(1);
   }
 
diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp
index 111c546f5329..ad113eda2791 100644
--- a/llvm/tools/llvm-profgen/PerfReader.cpp
+++ b/llvm/tools/llvm-profgen/PerfReader.cpp
@@ -42,6 +42,11 @@ static cl::opt<bool>
 cl::opt<bool> ShowDetailedWarning("show-detailed-warning",
                                   cl::desc("Show detailed warning message."));
 
+static cl::opt<int> CSProfMaxUnsymbolizedCtxDepth(
+    "csprof-max-unsymbolized-context-depth", cl::init(-1),
+    cl::desc("Keep the last K contexts while merging unsymbolized profile. -1 "
+             "means no depth limit."));
+
 extern cl::opt<std::string> PerfTraceFilename;
 extern cl::opt<bool> ShowDisassemblyOnly;
 extern cl::opt<bool> ShowSourceLocations;
@@ -172,7 +177,19 @@ std::shared_ptr<AddrBasedCtxKey> AddressStack::getContextKey() {
   std::shared_ptr<AddrBasedCtxKey> KeyStr = std::make_shared<AddrBasedCtxKey>();
   KeyStr->Context = Stack;
   CSProfileGenerator::compressRecursionContext<uint64_t>(KeyStr->Context);
-  CSProfileGenerator::trimContext<uint64_t>(KeyStr->Context);
+  // MaxContextDepth(--csprof-max-context-depth) is used to trim both symbolized
+  // and unsymbolized profile context. Sometimes we want to at least preserve
+  // the inlinings for the leaf frame(the profiled binary inlining),
+  // --csprof-max-context-depth may not be flexible enough, in this case,
+  // --csprof-max-unsymbolized-context-depth is used to limit the context for
+  // unsymbolized profile. If both are set, use the minimum of them.
+  int Depth = CSProfileGenerator::MaxContextDepth != -1
+                  ? CSProfileGenerator::MaxContextDepth
+                  : KeyStr->Context.size();
+  Depth = CSProfMaxUnsymbolizedCtxDepth != -1
+              ? std::min(static_cast<int>(CSProfMaxUnsymbolizedCtxDepth), Depth)
+              : Depth;
+  CSProfileGenerator::trimContext<uint64_t>(KeyStr->Context, Depth);
   return KeyStr;
 }
 
diff --git a/llvm/unittests/ProfileData/CoverageMappingTest.cpp b/llvm/unittests/ProfileData/CoverageMappingTest.cpp
index ef147674591c..46f881ecddb5 100644
--- a/llvm/unittests/ProfileData/CoverageMappingTest.cpp
+++ b/llvm/unittests/ProfileData/CoverageMappingTest.cpp
@@ -291,6 +291,45 @@ struct CoverageMappingTest : ::testing::TestWithParam<std::tuple<bool, bool>> {
   }
 };
 
+TEST(CoverageMappingTest, expression_subst) {
+  CounterExpressionBuilder Builder;
+  CounterExpressionBuilder::SubstMap MapToExpand;
+
+  auto C = [](unsigned ID) { return Counter::getCounter(ID); };
+  auto A = [&](Counter LHS, Counter RHS) { return Builder.add(LHS, RHS); };
+  // returns {E, N} in clangCodeGen
+  auto getBranchCounterPair = [&](Counter E, Counter P, Counter N) {
+    auto Skipped = Builder.subtract(P, E);
+    MapToExpand[N] = Builder.subst(Skipped, MapToExpand);
+  };
+
+  auto E18 = C(5);
+  auto P18 = C(2);
+  auto S18 = C(18);
+  // #18 => (#2 - #5)
+  getBranchCounterPair(E18, P18, S18);
+
+  auto E22 = S18;
+  auto P22 = C(0);
+  auto S22 = C(22);
+  // #22 => #0 - (#2 - #5)
+  getBranchCounterPair(E22, P22, S22);
+
+  auto E28 = A(A(C(9), C(11)), C(14));
+  auto P28 = S22;
+  auto S28 = C(28);
+  // #28 => (((((#0 + #5) - #2) - #9) - #11) - #14)
+  getBranchCounterPair(E28, P28, S28);
+
+  auto LHS = A(E28, A(S28, S18));
+  auto RHS = C(0);
+
+  // W/o subst, LHS cannot be reduced.
+  ASSERT_FALSE(Builder.subtract(LHS, RHS).isZero());
+  // W/ subst, C(18) and C(28) in LHS will be reduced.
+  ASSERT_TRUE(Builder.subst(Builder.subtract(LHS, RHS), MapToExpand).isZero());
+}
+
 TEST_P(CoverageMappingTest, basic_write_read) {
   startFunction("func", 0x1234);
   addCMR(Counter::getCounter(0), "foo", 1, 1, 1, 1);
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index d64f89847aa8..6de816582644 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -69,6 +69,7 @@ add_llvm_unittest(SupportTests
   PerThreadBumpPtrAllocatorTest.cpp
   ProcessTest.cpp
   ProgramTest.cpp
+  RecyclerTest.cpp
   RegexTest.cpp
   ReverseIterationTest.cpp
   ReplaceFileTest.cpp
diff --git a/llvm/unittests/Support/RecyclerTest.cpp b/llvm/unittests/Support/RecyclerTest.cpp
new file mode 100644
index 000000000000..a33506b47ebe
--- /dev/null
+++ b/llvm/unittests/Support/RecyclerTest.cpp
@@ -0,0 +1,47 @@
+//===--- unittest/Support/RecyclerTest.cpp --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Recycler.h"
+#include "llvm/Support/AllocatorBase.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+struct Object8 {
+  char Data[8];
+};
+
+class DecoratedMallocAllocator : public MallocAllocator {
+public:
+  int DeallocCount = 0;
+
+  template <typename T> void Deallocate(T *Ptr) {
+    DeallocCount++;
+    MallocAllocator::Deallocate(Ptr);
+  }
+};
+
+TEST(RecyclerTest, MoveConstructor) {
+  DecoratedMallocAllocator Allocator;
+  Recycler<Object8> R;
+  Object8 *A1 = R.Allocate(Allocator);
+  Object8 *A2 = R.Allocate(Allocator);
+  R.Deallocate(Allocator, A1);
+  R.Deallocate(Allocator, A2);
+  Recycler<Object8> R2(std::move(R));
+  Object8 *A3 = R2.Allocate(Allocator);
+  R2.Deallocate(Allocator, A3);
+  R.clear(Allocator); // Should not deallocate anything as it was moved from.
+  EXPECT_EQ(Allocator.DeallocCount, 0);
+  R2.clear(Allocator);
+  EXPECT_EQ(Allocator.DeallocCount, 2);
+}
+
+} // end anonymous namespace
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index 3ea5afce56fa..3955d36fce89 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -655,8 +655,8 @@ TEST(ParseArchString, RejectsConflictingExtensions) {
 
   for (StringRef Input :
        {"rv64i_xqcisls0p2", "rv64i_xqcia0p2", "rv64i_xqciac0p2",
-        "rv64i_xqcicsr0p2", "rv64i_xqcilsm0p2", "rv64i_xqcics0p2",
-        "rv64i_xqcicli0p2"}) {
+        "rv64i_xqcicsr0p2", "rv64i_xqcilsm0p2", "rv64i_xqcicm0p2",
+        "rv64i_xqcics0p2", "rv64i_xqcicli0p2"}) {
     EXPECT_THAT(
         toString(RISCVISAInfo::parseArchString(Input, true).takeError()),
         ::testing::EndsWith(" is only supported for 'rv32'"));
@@ -1118,6 +1118,7 @@ Experimental extensions
     xqcia                0.2
     xqciac               0.2
     xqcicli              0.2
+    xqcicm               0.2
     xqcics               0.2
     xqcicsr              0.2
     xqcilsm              0.2
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index 5bcc2b8eb2e2..1ac499fba417 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1220,9 +1220,9 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
     VPInstruction VPInst(Instruction::Add, {&Op1, &Op2});
     VPRecipeBase &Recipe = VPInst;
     EXPECT_FALSE(Recipe.mayHaveSideEffects());
-    EXPECT_TRUE(Recipe.mayReadFromMemory());
+    EXPECT_FALSE(Recipe.mayReadFromMemory());
     EXPECT_FALSE(Recipe.mayWriteToMemory());
-    EXPECT_TRUE(Recipe.mayReadOrWriteMemory());
+    EXPECT_FALSE(Recipe.mayReadOrWriteMemory());
   }
   {
     VPValue Op1;
diff --git a/llvm/unittests/tools/llvm-exegesis/CMakeLists.txt b/llvm/unittests/tools/llvm-exegesis/CMakeLists.txt
index 3ee3a0dc6b5d..735f17ab03e6 100644
--- a/llvm/unittests/tools/llvm-exegesis/CMakeLists.txt
+++ b/llvm/unittests/tools/llvm-exegesis/CMakeLists.txt
@@ -53,6 +53,9 @@ endif()
 if(LLVM_TARGETS_TO_BUILD MATCHES "Mips")
   include(Mips/CMakeLists.txt)
 endif()
+if(LLVM_TARGETS_TO_BUILD MATCHES "RISCV")
+  include(RISCV/CMakeLists.txt)
+endif()
 
 include_directories(${exegesis_includes})
 
diff --git a/llvm/unittests/tools/llvm-exegesis/RISCV/CMakeLists.txt b/llvm/unittests/tools/llvm-exegesis/RISCV/CMakeLists.txt
new file mode 100644
index 000000000000..1984819be773
--- /dev/null
+++ b/llvm/unittests/tools/llvm-exegesis/RISCV/CMakeLists.txt
@@ -0,0 +1,21 @@
+add_llvm_exegesis_unittest_includes(
+  ${LLVM_MAIN_SRC_DIR}/lib/Target/RISCV
+  ${LLVM_BINARY_DIR}/lib/Target/RISCV
+  ${LLVM_MAIN_SRC_DIR}/tools/llvm-exegesis/lib
+  )
+
+add_llvm_exegesis_unittest_link_components(
+  MC
+  MCParser
+  Object
+  Support
+  Symbolize
+  RISCV
+  )
+
+add_llvm_exegesis_unittest_sources(
+  SnippetGeneratorTest.cpp
+  TargetTest.cpp
+  )
+add_llvm_exegesis_unittest_link_libraries(
+  LLVMExegesisRISCV)
diff --git a/llvm/unittests/tools/llvm-exegesis/RISCV/SnippetGeneratorTest.cpp b/llvm/unittests/tools/llvm-exegesis/RISCV/SnippetGeneratorTest.cpp
new file mode 100644
index 000000000000..5920b79da9d3
--- /dev/null
+++ b/llvm/unittests/tools/llvm-exegesis/RISCV/SnippetGeneratorTest.cpp
@@ -0,0 +1,122 @@
+//===-- SnippetGeneratorTest.cpp --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../Common/AssemblerUtils.h"
+#include "LlvmState.h"
+#include "MCInstrDescView.h"
+#include "ParallelSnippetGenerator.h"
+#include "RISCVInstrInfo.h"
+#include "RegisterAliasing.h"
+#include "SerialSnippetGenerator.h"
+#include "TestBase.h"
+
+namespace llvm {
+namespace exegesis {
+namespace {
+
+using testing::AnyOf;
+using testing::ElementsAre;
+using testing::HasSubstr;
+using testing::SizeIs;
+
+MATCHER(IsInvalid, "") { return !arg.isValid(); }
+MATCHER(IsReg, "") { return arg.isReg(); }
+
+template <typename SnippetGeneratorT>
+class RISCVSnippetGeneratorTest : public RISCVTestBase {
+protected:
+  RISCVSnippetGeneratorTest() : Generator(State, SnippetGenerator::Options()) {}
+
+  std::vector<CodeTemplate> checkAndGetCodeTemplates(unsigned Opcode) {
+    randomGenerator().seed(0); // Initialize seed.
+    const Instruction &Instr = State.getIC().getInstr(Opcode);
+    auto CodeTemplateOrError = Generator.generateCodeTemplates(
+        &Instr, State.getRATC().emptyRegisters());
+    EXPECT_FALSE(CodeTemplateOrError.takeError()); // Valid configuration.
+    return std::move(CodeTemplateOrError.get());
+  }
+
+  SnippetGeneratorT Generator;
+};
+
+using RISCVSerialSnippetGeneratorTest =
+    RISCVSnippetGeneratorTest<SerialSnippetGenerator>;
+
+using RISCVParallelSnippetGeneratorTest =
+    RISCVSnippetGeneratorTest<ParallelSnippetGenerator>;
+
+TEST_F(RISCVSerialSnippetGeneratorTest,
+       ImplicitSelfDependencyThroughExplicitRegs) {
+  // - ADD
+  // - Op0 Explicit Def RegClass(GPR)
+  // - Op1 Explicit Use RegClass(GPR)
+  // - Op2 Explicit Use RegClass(GPR)
+  // - Var0 [Op0]
+  // - Var1 [Op1]
+  // - Var2 [Op2]
+  // - hasAliasingRegisters
+  const unsigned Opcode = RISCV::ADD;
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
+  EXPECT_THAT(CT.Execution, ExecutionMode::SERIAL_VIA_EXPLICIT_REGS);
+  ASSERT_THAT(CT.Instructions, SizeIs(1));
+  const InstructionTemplate &IT = CT.Instructions[0];
+  EXPECT_THAT(IT.getOpcode(), Opcode);
+  ASSERT_THAT(IT.getVariableValues(), SizeIs(3));
+  EXPECT_THAT(IT.getVariableValues(),
+              AnyOf(ElementsAre(IsReg(), IsInvalid(), IsReg()),
+                    ElementsAre(IsReg(), IsReg(), IsInvalid())))
+      << "Op0 is either set to Op1 or to Op2";
+}
+
+TEST_F(RISCVSerialSnippetGeneratorTest,
+       ImplicitSelfDependencyThroughExplicitRegsForbidAll) {
+  // - XOR
+  // - Op0 Explicit Def RegClass(GPR)
+  // - Op1 Explicit Use RegClass(GPR)
+  // - Op2 Explicit Use RegClass(GPR)
+  // - Var0 [Op0]
+  // - Var1 [Op1]
+  // - Var2 [Op2]
+  // - hasAliasingRegisters
+  randomGenerator().seed(0); // Initialize seed.
+  const Instruction &Instr = State.getIC().getInstr(RISCV::XOR);
+  auto AllRegisters = State.getRATC().emptyRegisters();
+  AllRegisters.flip();
+  EXPECT_TRUE(errorToBool(
+      Generator.generateCodeTemplates(&Instr, AllRegisters).takeError()));
+}
+
+TEST_F(RISCVParallelSnippetGeneratorTest, MemoryUse) {
+  // LB reads from memory.
+  // - LB
+  // - Op0 Explicit Def RegClass(GPR)
+  // - Op1 Explicit Use Memory RegClass(GPR)
+  // - Op2 Explicit Use Memory
+  // - Var0 [Op0]
+  // - Var1 [Op1]
+  // - Var2 [Op2]
+  // - hasMemoryOperands
+  const unsigned Opcode = RISCV::LB;
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
+  EXPECT_THAT(CT.Info, HasSubstr("instruction has no tied variables"));
+  EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
+  ASSERT_THAT(CT.Instructions,
+              SizeIs(ParallelSnippetGenerator::kMinNumDifferentAddresses));
+  const InstructionTemplate &IT = CT.Instructions[0];
+  EXPECT_THAT(IT.getOpcode(), Opcode);
+  ASSERT_THAT(IT.getVariableValues(), SizeIs(3));
+  EXPECT_EQ(IT.getVariableValues()[1].getReg(), RISCV::X10);
+}
+
+} // namespace
+} // namespace exegesis
+} // namespace llvm
diff --git a/llvm/unittests/tools/llvm-exegesis/RISCV/TargetTest.cpp b/llvm/unittests/tools/llvm-exegesis/RISCV/TargetTest.cpp
new file mode 100644
index 000000000000..12d3ce7165a8
--- /dev/null
+++ b/llvm/unittests/tools/llvm-exegesis/RISCV/TargetTest.cpp
@@ -0,0 +1,56 @@
+//===-- TargetTest.cpp ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Target.h"
+
+#include <cassert>
+#include <memory>
+
+#include "MCTargetDesc/RISCVMCTargetDesc.h"
+#include "TestBase.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace llvm {
+namespace exegesis {
+
+void InitializeRISCVExegesisTarget();
+
+namespace {
+
+using testing::IsEmpty;
+using testing::Not;
+using testing::NotNull;
+
+class RISCVTargetTest : public RISCVTestBase {
+protected:
+  std::vector<MCInst> setRegTo(unsigned Reg, const APInt &Value) {
+    return State.getExegesisTarget().setRegTo(State.getSubtargetInfo(), Reg,
+                                              Value);
+  }
+};
+
+TEST_F(RISCVTargetTest, SetRegToConstant) {
+  const auto Insts = setRegTo(RISCV::X10, APInt());
+  EXPECT_THAT(Insts, Not(IsEmpty()));
+}
+
+TEST_F(RISCVTargetTest, DefaultPfmCounters) {
+  const std::string Expected = "CYCLES";
+  EXPECT_EQ(State.getExegesisTarget().getPfmCounters("").CycleCounter,
+            Expected);
+  EXPECT_EQ(
+      State.getExegesisTarget().getPfmCounters("unknown_cpu").CycleCounter,
+      Expected);
+}
+
+} // namespace
+} // namespace exegesis
+} // namespace llvm
diff --git a/llvm/unittests/tools/llvm-exegesis/RISCV/TestBase.h b/llvm/unittests/tools/llvm-exegesis/RISCV/TestBase.h
new file mode 100644
index 000000000000..66748fb9a2ce
--- /dev/null
+++ b/llvm/unittests/tools/llvm-exegesis/RISCV/TestBase.h
@@ -0,0 +1,44 @@
+//===-- TestBase.h ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Test fixture common to all RISC-V tests.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_UNITTESTS_TOOLS_LLVMEXEGESIS_RISCV_TESTBASE_H
+#define LLVM_UNITTESTS_TOOLS_LLVMEXEGESIS_RISCV_TESTBASE_H
+
+#include "LlvmState.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace llvm {
+namespace exegesis {
+
+void InitializeRISCVExegesisTarget();
+
+class RISCVTestBase : public ::testing::Test {
+protected:
+  RISCVTestBase()
+      : State(cantFail(
+            LLVMState::Create("riscv64-unknown-linux", "generic-rv64"))) {}
+
+  static void SetUpTestCase() {
+    LLVMInitializeRISCVTargetInfo();
+    LLVMInitializeRISCVTargetMC();
+    LLVMInitializeRISCVTarget();
+    InitializeRISCVExegesisTarget();
+  }
+
+  const LLVMState State;
+};
+
+} // namespace exegesis
+} // namespace llvm
+
+#endif
diff --git a/llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp
index 3b02f63e9490..4dea89ecbeff 100644
--- a/llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/ARMTargetDefEmitter.cpp
@@ -162,14 +162,14 @@ static void emitARMTargetDef(const RecordKeeper &RK, raw_ostream &OS) {
   for (const Record *Rec : FMVExts) {
     OS << "  I.emplace_back(";
     OS << "\"" << Rec->getValueAsString("Name") << "\"";
-    OS << ", " << Rec->getValueAsString("Bit");
+    OS << ", " << Rec->getValueAsString("FeatureBit");
+    OS << ", " << Rec->getValueAsString("PriorityBit");
     auto FeatName = Rec->getValueAsString("BackendFeature");
     const Record *FeatRec = ExtensionMap[FeatName];
     if (FeatRec)
       OS << ", " << FeatRec->getValueAsString("ArchExtKindSpelling").upper();
     else
       OS << ", std::nullopt";
-    OS << ", " << (uint64_t)Rec->getValueAsInt("Priority");
     OS << ");\n";
   };
   OS << "  return I;\n"
diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp
index a0c93bed5ad8..7488c8de5788 100644
--- a/llvm/utils/TableGen/DXILEmitter.cpp
+++ b/llvm/utils/TableGen/DXILEmitter.cpp
@@ -218,8 +218,10 @@ static StringRef getOverloadKindStr(const Record *R) {
       .Case("Int64Ty", "OverloadKind::I64")
       .Case("ResRetHalfTy", "OverloadKind::HALF")
       .Case("ResRetFloatTy", "OverloadKind::FLOAT")
+      .Case("ResRetDoubleTy", "OverloadKind::DOUBLE")
       .Case("ResRetInt16Ty", "OverloadKind::I16")
-      .Case("ResRetInt32Ty", "OverloadKind::I32");
+      .Case("ResRetInt32Ty", "OverloadKind::I32")
+      .Case("ResRetInt64Ty", "OverloadKind::I64");
 }
 
 /// Return a string representation of valid overload information denoted
diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py
index b108a21dbc52..e1cc02e1a608 100644
--- a/llvm/utils/UpdateTestChecks/common.py
+++ b/llvm/utils/UpdateTestChecks/common.py
@@ -1396,7 +1396,7 @@ def find_diff_matching(lhs: List[str], rhs: List[str]) -> List[tuple]:
             backlinks.append(None)
 
     # Commit to names in the matching by walking the backlinks. Recursively
-    # attempt to fill in more matches in-betweem.
+    # attempt to fill in more matches in-between.
     match_idx = table_candidate_idx[-1]
     while match_idx is not None:
         current = candidates[match_idx]
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn
index 7deefe9dc061..c79d5ad662b7 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn
@@ -80,6 +80,7 @@ unittest("ClangdTests") {
     "GlobalCompilationDatabaseTests.cpp",
     "HeaderSourceSwitchTests.cpp",
     "HeadersTests.cpp",
+    "HeuristicResolverTests.cpp",
     "HoverTests.cpp",
     "IncludeCleanerTests.cpp",
     "IndexActionTests.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/RISCV/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/RISCV/BUILD.gn
index c334b54a833b..7c6cdd0a34d2 100644
--- a/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/RISCV/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/tools/llvm-exegesis/lib/RISCV/BUILD.gn
@@ -1,6 +1,14 @@
+import("//llvm/utils/TableGen/tablegen.gni")
+
+tablegen("RISCVGenExegesis") {
+  args = [ "-gen-exegesis" ]
+  td_file = "//llvm/lib/Target/RISCV/RISCV.td"
+}
+
 static_library("RISCV") {
   output_name = "LLVMExegesisRISCV"
   deps = [
+    ":RISCVGenExegesis",
     "//llvm/lib/CodeGen",
     "//llvm/lib/IR",
     "//llvm/lib/Support",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn
index 78875ea98102..0d01bfa98017 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn
@@ -102,7 +102,10 @@ group("unittests") {
     ]
   }
   if (llvm_build_RISCV) {
-    deps += [ "Target/RISCV:RISCVTests" ]
+    deps += [
+      "Target/RISCV:RISCVTests",
+      "tools/llvm-exegesis/RISCV:LLVMExegesisRISCVTests",
+    ]
   }
   if (llvm_build_SystemZ) {
     deps += [ "MC/SystemZ:SystemZAsmLexerTests" ]
diff --git a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn
index 47b03b42d096..bf6a0b752327 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn
@@ -73,6 +73,7 @@ unittest("SupportTests") {
     "ProcessTest.cpp",
     "ProgramTest.cpp",
     "RISCVAttributeParserTest.cpp",
+    "RecyclerTest.cpp",
     "RegexTest.cpp",
     "ReplaceFileTest.cpp",
     "ReverseIterationTest.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/tools/llvm-exegesis/RISCV/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/tools/llvm-exegesis/RISCV/BUILD.gn
new file mode 100644
index 000000000000..d1db867c79f5
--- /dev/null
+++ b/llvm/utils/gn/secondary/llvm/unittests/tools/llvm-exegesis/RISCV/BUILD.gn
@@ -0,0 +1,26 @@
+import("//third-party/unittest/unittest.gni")
+
+unittest("LLVMExegesisRISCVTests") {
+  deps = [
+    "//llvm/lib/DebugInfo/Symbolize",
+    "//llvm/lib/MC",
+    "//llvm/lib/MC/MCParser",
+    "//llvm/lib/Object",
+    "//llvm/lib/Support",
+    "//llvm/lib/Target/RISCV",
+
+    # Exegesis reaches inside the Target/RISCV tablegen internals and must
+    # depend on these Target/RISCV-internal build targets.
+    "//llvm/lib/Target/RISCV/MCTargetDesc",
+    "//llvm/tools/llvm-exegesis/lib",
+    "//llvm/tools/llvm-exegesis/lib/RISCV",
+  ]
+  include_dirs = [
+    "//llvm/lib/Target/RISCV",
+    "//llvm/tools/llvm-exegesis/lib",
+  ]
+  sources = [
+    "SnippetGeneratorTest.cpp",
+    "TargetTest.cpp",
+  ]
+}
diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index 7416e522083b..a888ac243b04 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -170,7 +170,7 @@ configure_file(
 #   The pybind11 library can be found (set with -DPYBIND_DIR=...)
 #   The python executable is correct (set with -DPython3_EXECUTABLE=...)
 # By default, find_package and probing for installed pybind11 is performed.
-# Super projects can set MLIR_DISABLE_CONFIGURE_PYTHON_DEV_PACKAGES=ON to 
+# Super projects can set MLIR_DISABLE_CONFIGURE_PYTHON_DEV_PACKAGES=ON to
 # disable all package setup and control it themselves.
 #-------------------------------------------------------------------------------
 
diff --git a/mlir/docs/Bindings/Python.md b/mlir/docs/Bindings/Python.md
index a0bd1cac118b..32df3310d811 100644
--- a/mlir/docs/Bindings/Python.md
+++ b/mlir/docs/Bindings/Python.md
@@ -1035,7 +1035,7 @@ class ConstantOp(_ods_ir.OpView):
     ...
 ```
 
-expects `value` to be a `TypedAttr` (e.g., `IntegerAttr` or `FloatAttr`). 
+expects `value` to be a `TypedAttr` (e.g., `IntegerAttr` or `FloatAttr`).
 Thus, a natural extension is a builder that accepts a MLIR type and a Python value and instantiates the appropriate `TypedAttr`:
 
 ```python
@@ -1181,9 +1181,9 @@ make the passes available along with the dialect.
 Dialect functionality other than IR objects or passes, such as helper functions,
 can be exposed to Python similarly to attributes and types. C API is expected to
 exist for this functionality, which can then be wrapped using pybind11 and
-`[include/mlir/Bindings/Python/PybindAdaptors.h](https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Bindings/Python/PybindAdaptors.h)`,
+[`include/mlir/Bindings/Python/PybindAdaptors.h`](https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Bindings/Python/PybindAdaptors.h),
 or nanobind and
-`[include/mlir/Bindings/Python/NanobindAdaptors.h](https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h)`
+[`include/mlir/Bindings/Python/NanobindAdaptors.h`](https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h)
 utilities to connect to the rest of Python API. The bindings can be located in a
 separate module or in the same module as attributes and types, and
 loaded along with the dialect.
diff --git a/mlir/include/mlir-c/Dialect/LLVM.h b/mlir/include/mlir-c/Dialect/LLVM.h
index 0992285f997e..26c4140757c3 100644
--- a/mlir/include/mlir-c/Dialect/LLVM.h
+++ b/mlir/include/mlir-c/Dialect/LLVM.h
@@ -45,6 +45,13 @@ MLIR_CAPI_EXPORTED MlirType
 mlirLLVMFunctionTypeGet(MlirType resultType, intptr_t nArgumentTypes,
                         MlirType const *argumentTypes, bool isVarArg);
 
+/// Returns the number of input types.
+MLIR_CAPI_EXPORTED intptr_t mlirLLVMFunctionTypeGetNumInputs(MlirType type);
+
+/// Returns the pos-th input type.
+MLIR_CAPI_EXPORTED MlirType mlirLLVMFunctionTypeGetInput(MlirType type,
+                                                         intptr_t pos);
+
 /// Returns `true` if the type is an LLVM dialect struct type.
 MLIR_CAPI_EXPORTED bool mlirTypeIsALLVMStructType(MlirType type);
 
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 42a017db300a..3adfd5f4f2c4 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1055,7 +1055,7 @@ def GPU_PrintfOp : GPU_Op<"printf", [MemoryEffects<[MemWrite]>]>,
     imposed by one's target platform.
   }];
   let assemblyFormat = [{
-    $format attr-dict ($args^ `:` type($args))?
+    $format attr-dict (`,` $args^ `:` type($args))?
   }];
 }
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
index e8eeafd09a9c..267389774bd5 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
@@ -825,7 +825,7 @@ def LLVM_MemoryEffectsAttr : LLVM_Attr<"MemoryEffects", "memory_effects"> {
 def LLVM_AliasScopeDomainAttr : LLVM_Attr<"AliasScopeDomain",
                                           "alias_scope_domain"> {
   let parameters = (ins
-    "DistinctAttr":$id,
+    "Attribute":$id,
     OptionalParameter<"StringAttr">:$description
   );
 
@@ -853,7 +853,7 @@ def LLVM_AliasScopeDomainAttr : LLVM_Attr<"AliasScopeDomain",
 
 def LLVM_AliasScopeAttr : LLVM_Attr<"AliasScope", "alias_scope"> {
   let parameters = (ins
-    "DistinctAttr":$id,
+    "Attribute":$id,
     "AliasScopeDomainAttr":$domain,
     OptionalParameter<"StringAttr">:$description
   );
@@ -891,6 +891,8 @@ def LLVM_AliasScopeAttr : LLVM_Attr<"AliasScope", "alias_scope"> {
     }
     ```
 
+    The first attribute can either be a DistinctAttr or a StringAttr.
+
     See the following link for more details:
     https://llvm.org/docs/LangRef.html#noalias-and-alias-scope-metadata
   }];
@@ -898,6 +900,8 @@ def LLVM_AliasScopeAttr : LLVM_Attr<"AliasScope", "alias_scope"> {
   let summary = "LLVM dialect alias scope";
 
   let assemblyFormat = "`<` struct(params) `>`";
+
+  let genVerifyDecl = 1;
 }
 
 def LLVM_AliasScopeArrayAttr
diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
index 23c597a1ca51..6f408b3c924d 100644
--- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
+++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
@@ -302,7 +302,7 @@ def ForallOp : SCF_Op<"forall", [
        AttrSizedOperandSegments,
        AutomaticAllocationScope,
        DeclareOpInterfaceMethods<LoopLikeOpInterface,
-          ["getInitsMutable", "getRegionIterArgs", "getLoopInductionVars", 
+          ["getInitsMutable", "getRegionIterArgs", "getLoopInductionVars",
            "getLoopLowerBounds", "getLoopUpperBounds", "getLoopSteps",
            "promoteIfSingleIteration", "yieldTiledValuesAndReplace"]>,
        RecursiveMemoryEffects,
@@ -671,7 +671,7 @@ def IfOp : SCF_Op<"if", [DeclareOpInterfaceMethods<RegionBranchOpInterface, [
     "getNumRegionInvocations", "getRegionInvocationBounds",
     "getEntrySuccessorRegions"]>,
     InferTypeOpAdaptor, SingleBlockImplicitTerminator<"scf::YieldOp">,
-    RecursiveMemoryEffects, NoRegionArguments]> {
+    RecursiveMemoryEffects, RecursivelySpeculatable, NoRegionArguments]> {
   let summary = "if-then-else operation";
   let description = [{
     The `scf.if` operation represents an if-then-else construct for
diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h b/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h
index 9c1479d28c30..18c9dfd205de 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h
@@ -68,8 +68,7 @@ void populateSCFStructuralTypeConversionTarget(
 /// applyPartialOneToNConversion.
 /// FIXME: The 1:N dialect conversion is deprecated and will be removed soon.
 /// 1:N support has been added to the regular dialect conversion driver.
-LLVM_DEPRECATED("Use populateSCFStructuralTypeConversions() instead",
-                "populateSCFStructuralTypeConversions")
+/// Use populateSCFStructuralTypeConversions() instead.
 void populateSCFStructuralOneToNTypeConversions(
     const TypeConverter &typeConverter, RewritePatternSet &patterns);
 
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
index f5536927dc25..d3f12c34421b 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
@@ -126,11 +126,12 @@ def Tosa_ConvOpQuantInfoBuilder : OpBuilder<
   (ins "::mlir::Type":$outputType, "::mlir::Value":$input,
        "::mlir::Value":$weight, "::mlir::Value":$bias,
        "::mlir::DenseI64ArrayAttr":$pad, "::mlir::DenseI64ArrayAttr":$stride,
-       "::mlir::DenseI64ArrayAttr":$dilation),
+       "::mlir::DenseI64ArrayAttr":$dilation,
+       "::mlir::TypeAttr":$acc_type),
   [{
     buildConvOpWithQuantInfo($_builder, $_state, outputType,
                              input, weight, bias,
-                             pad, stride, dilation);
+                             pad, stride, dilation, acc_type);
   }]>;
 
 // Handles tosa.transpose_conv2d which has an outpad and output shape attribute.
@@ -139,12 +140,13 @@ def Tosa_TransConvOpQuantInfoBuilder : OpBuilder<
        "::mlir::Value":$weight, "mlir::Value":$bias,
        "::mlir::DenseI64ArrayAttr":$outpad,
        "::mlir::DenseI64ArrayAttr":$stride,
-       "::mlir::DenseI64ArrayAttr":$outputShape),
+       "::mlir::DenseI64ArrayAttr":$outputShape,
+       "::mlir::TypeAttr":$acc_type),
   [{
     buildTransConvOpWithQuantInfo($_builder, $_state, outputType,
                                   input, weight, bias,
                                   outpad, stride,
-                                  outputShape);
+                                  outputShape, acc_type);
   }]>;
 
 // The tosa.fully_connected op has its own builder as it does not have
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
index 8ae5d3ab417b..6b43c9a259b1 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
@@ -57,7 +57,7 @@ def Tosa_ArgMaxOp : Tosa_InferShapedTypeOp<"argmax"> {
 // Accumulator types.
 //===----------------------------------------------------------------------===//
 
-def Tosa_AccType : AnyTypeOf<[I<32>, SI<32>, F16, F32]>;
+def Tosa_AccType : AnyTypeOf<[I<32>, I<48>, F16, F32]>;
 
 //===----------------------------------------------------------------------===//
 // Operator: avg_pool2d
@@ -106,6 +106,7 @@ def Tosa_Conv2DOp : Tosa_InferShapedTypeOp<"conv2d"> {
     Tosa_IntArrayAttr4:$pad,
     Tosa_IntArrayAttr2:$stride,
     Tosa_IntArrayAttr2:$dilation,
+    TypeAttrOf<Tosa_AccType>:$acc_type,
     OptionalAttr<Tosa_ConvOpQuantizationAttr>:$quantization_info,
     DefaultValuedOptionalAttr<BoolAttr, "false">:$local_bound
   );
@@ -135,6 +136,7 @@ def Tosa_Conv3DOp : Tosa_InferShapedTypeOp<"conv3d"> {
     Tosa_IntArrayAttr6:$pad,
     Tosa_IntArrayAttr3:$stride,
     Tosa_IntArrayAttr3:$dilation,
+    TypeAttrOf<Tosa_AccType>:$acc_type,
     OptionalAttr<Tosa_ConvOpQuantizationAttr>:$quantization_info,
     DefaultValuedOptionalAttr<BoolAttr, "false">:$local_bound
   );
@@ -165,6 +167,7 @@ def Tosa_DepthwiseConv2DOp : Tosa_InferShapedTypeOp<"depthwise_conv2d"> {
     Tosa_IntArrayAttr4:$pad,
     Tosa_IntArrayAttr2:$stride,
     Tosa_IntArrayAttr2:$dilation,
+    TypeAttrOf<Tosa_AccType>:$acc_type,
     OptionalAttr<Tosa_ConvOpQuantizationAttr>:$quantization_info,
     DefaultValuedOptionalAttr<BoolAttr, "false">:$local_bound
   );
@@ -348,6 +351,7 @@ def Tosa_TransposeConv2DOp : Tosa_InferShapedTypeOp<"transpose_conv2d"> {
     Tosa_IntArrayAttr4:$out_pad,
     Tosa_IntArrayAttr2:$stride,
     Tosa_IntArrayAttr4:$out_shape,
+    TypeAttrOf<Tosa_AccType>:$acc_type,
     OptionalAttr<Tosa_ConvOpQuantizationAttr>:$quantization_info,
     DefaultValuedOptionalAttr<BoolAttr, "false">:$local_bound
   );
@@ -357,6 +361,7 @@ def Tosa_TransposeConv2DOp : Tosa_InferShapedTypeOp<"transpose_conv2d"> {
   );
 
   let builders = [Tosa_TransConvOpQuantInfoBuilder];
+  let hasVerifier = 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1552,21 +1557,21 @@ def Tosa_PadOp : Tosa_InferShapedTypeOp<"pad"> {
     Example:
 
     ```mlir
-    %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-    tosa.pad %arg0, %0 : (tensor<1x2xf32>, tensor<2x2xi32>)  -> (tensor<4x9xf32>)
+    %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
+    tosa.pad %arg0, %0 : (tensor<1x2xf32>, tensor<4xi32>)  -> (tensor<4x9xf32>)
     ```
 
     Example 2:
 
     ```mlir
-    %0 = arith.constant dense<[[-1, 2], [3, 4]]> : tensor<2x2xi32>
-    tosa.pad %arg0, %0 : (tensor<1x2xf32>, tensor<2x2xi32>)  -> (tensor<?x9xf32>)
+    %0 = arith.constant dense<[-1, 2, 3, 4]> : tensor<4xi32>
+    tosa.pad %arg0, %0 : (tensor<1x2xf32>, tensor<4xi32>)  -> (tensor<?x9xf32>)
     ```
   }];
 
   let arguments = (ins
     Tosa_RankedTensor:$input1,
-    Tosa_Int32Or64Tensor:$padding,
+    TosaTensorRankOf<[Tosa_Int32Or64], [1]>:$padding,
     Optional<Tosa_ScalarTensor>:$pad_const,
     OptionalAttr<Tosa_PadOpQuantizationAttr>:$quantization_info
   );
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
index a6d3163d4446..d3cc6e92bac2 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
@@ -65,17 +65,17 @@ def Tosa_Int32Or64 : AnyTypeOf<[Tosa_Int32,
 // int8  : symmetric  per tensor/per channel, signed
 // int16 : symmetric  per tensor,             signed
 //===----------------------------------------------------------------------===//
-def Tosa_QuantizedInt	: AnyTypeOf<[ Tosa_QuantizedType<"uint8", [8], 0>,
-                                     Tosa_QuantizedType<"int4", [4, 0], 1>,
-                                     Tosa_QuantizedType<"int8", [8, 0], 1>,
-                                     Tosa_QuantizedType<"int16", [16, 0], 1>,
-                                     Tosa_QuantizedType<"int32", [32, 0], 1>]>;
+def Tosa_QuantizedInt : AnyTypeOf<[Tosa_QuantizedType<"uint8", [8], 0>,
+                                   Tosa_QuantizedType<"int4", [4, 0], 1>,
+                                   Tosa_QuantizedType<"int8", [8, 0], 1>,
+                                   Tosa_QuantizedType<"int16", [16, 0], 1>,
+                                   Tosa_QuantizedType<"int32", [32, 0], 1>]>;
 
 //===----------------------------------------------------------------------===//
 // Multi-category types.
 //===----------------------------------------------------------------------===//
 def Tosa_AnyNumber : AnyTypeOf<[Tosa_Int, Tosa_QuantizedInt, AnyFloat],
-                               "number">;
+                                "number">;
 
 // For weight tensors from tosa::Conv2DOp, tosa::Conv3DOp,
 // tosa::DepthwiseConv2DOp, tosa::TransposeConv2DOp, tosa::FullyConnectedOp
@@ -112,7 +112,7 @@ class TosaTensorRankOf<list<Type> allowedTypes, list<int> ranks>
 
 def Tosa_I1Tensor : TosaTensorOf<[I1]>;
 def Tosa_Int32Tensor : TosaTensorOf<[Tosa_Int32]>;
-def Tosa_Int32Or64Tensor :TosaTensorOf<[Tosa_Int32Or64]>;
+def Tosa_Int32Or64Tensor : TosaTensorOf<[Tosa_Int32Or64]>;
 
 def Tosa_FloatTensor : TosaTensorOf<[AnyFloat]>;
 
diff --git a/mlir/include/mlir/IR/OperationSupport.h b/mlir/include/mlir/IR/OperationSupport.h
index f4cc5baa6335..5eb2d69134ea 100644
--- a/mlir/include/mlir/IR/OperationSupport.h
+++ b/mlir/include/mlir/IR/OperationSupport.h
@@ -1166,16 +1166,20 @@ public:
   OpPrintingFlags &skipRegions(bool skip = true);
 
   /// Do not verify the operation when using custom operation printers.
-  OpPrintingFlags &assumeVerified();
+  OpPrintingFlags &assumeVerified(bool enable = true);
 
   /// Use local scope when printing the operation. This allows for using the
   /// printer in a more localized and thread-safe setting, but may not
   /// necessarily be identical to what the IR will look like when dumping
   /// the full module.
-  OpPrintingFlags &useLocalScope();
+  OpPrintingFlags &useLocalScope(bool enable = true);
 
   /// Print users of values as comments.
-  OpPrintingFlags &printValueUsers();
+  OpPrintingFlags &printValueUsers(bool enable = true);
+
+  /// Print unique SSA ID numbers for values, block arguments and naming
+  /// conflicts across all regions
+  OpPrintingFlags &printUniqueSSAIDs(bool enable = true);
 
   /// Return if the given ElementsAttr should be elided.
   bool shouldElideElementsAttr(ElementsAttr attr) const;
diff --git a/mlir/include/mlir/Transforms/LocationSnapshot.h b/mlir/include/mlir/Transforms/LocationSnapshot.h
index ccfdbac007ac..cefe005d2c4c 100644
--- a/mlir/include/mlir/Transforms/LocationSnapshot.h
+++ b/mlir/include/mlir/Transforms/LocationSnapshot.h
@@ -51,18 +51,6 @@ void generateLocationsFromIR(raw_ostream &os, StringRef fileName, StringRef tag,
 LogicalResult generateLocationsFromIR(StringRef fileName, StringRef tag,
                                       Operation *op, OpPrintingFlags flags);
 
-/// Create a pass to generate new locations by snapshotting the IR to the given
-/// file, and using the printed locations within that file. If `filename` is
-/// empty, a temporary file is generated instead. If a 'tag' is non-empty, the
-/// generated locations are represented as a NameLoc with the given tag as the
-/// name, and then fused with the existing locations. Otherwise, the existing
-/// locations are replaced.
-std::unique_ptr<Pass> createLocationSnapshotPass(OpPrintingFlags flags,
-                                                 StringRef fileName = "",
-                                                 StringRef tag = "");
-/// Overload utilizing pass options for initialization.
-std::unique_ptr<Pass> createLocationSnapshotPass();
-
 } // namespace mlir
 
 #endif // MLIR_TRANSFORMS_LOCATIONSNAPSHOT_H
diff --git a/mlir/include/mlir/Transforms/OneToNTypeConversion.h b/mlir/include/mlir/Transforms/OneToNTypeConversion.h
index 9c74bf916d97..37a326818d64 100644
--- a/mlir/include/mlir/Transforms/OneToNTypeConversion.h
+++ b/mlir/include/mlir/Transforms/OneToNTypeConversion.h
@@ -123,9 +123,7 @@ public:
   /// (i.e., the converted types must be the same as the types of the new
   /// values).
   /// FIXME: The 1:N dialect conversion is deprecated and will be removed soon.
-  /// 1:N support has been added to the regular dialect conversion driver.
-  LLVM_DEPRECATED("Use replaceOpWithMultiple() instead",
-                  "replaceOpWithMultiple")
+  /// Use replaceOpWithMultiple() instead.
   void replaceOp(Operation *op, ValueRange newValues,
                  const OneToNTypeMapping &resultMapping);
   using PatternRewriter::replaceOp;
@@ -260,8 +258,7 @@ public:
 /// only "partial").
 /// FIXME: The 1:N dialect conversion is deprecated and will be removed soon.
 /// 1:N support has been added to the regular dialect conversion driver.
-LLVM_DEPRECATED("Use applyPartialConversion() instead",
-                "applyPartialConversion")
+/// Use applyPartialConversion() instead.
 LogicalResult
 applyPartialOneToNConversion(Operation *op, TypeConverter &typeConverter,
                              const FrozenRewritePatternSet &patterns);
@@ -272,9 +269,7 @@ applyPartialOneToNConversion(Operation *op, TypeConverter &typeConverter,
 /// used with the 1:N dialect conversion.
 /// FIXME: The 1:N dialect conversion is deprecated and will be removed soon.
 /// 1:N support has been added to the regular dialect conversion driver.
-LLVM_DEPRECATED(
-    "Use populateFunctionOpInterfaceTypeConversionPattern() instead",
-    "populateFunctionOpInterfaceTypeConversionPattern")
+/// Use populateFunctionOpInterfaceTypeConversionPattern() instead.
 void populateOneToNFunctionOpInterfaceTypeConversionPattern(
     StringRef functionLikeOpName, const TypeConverter &converter,
     RewritePatternSet &patterns);
diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td
index 000d9f697618..c4a8e7a81fa4 100644
--- a/mlir/include/mlir/Transforms/Passes.td
+++ b/mlir/include/mlir/Transforms/Passes.td
@@ -331,13 +331,21 @@ def LocationSnapshot : Pass<"snapshot-op-locations"> {
     ... loc(fused["original_source.cpp":1:1, "snapshot"("snapshot_source.mlir":10:10)])
     ```
   }];
-  let constructor = "mlir::createLocationSnapshotPass()";
   let options = [
     Option<"fileName", "filename", "std::string", /*default=*/"",
            "The filename to print the generated IR">,
     Option<"tag", "tag", "std::string", /*default=*/"",
            "A tag to use when fusing the new locations with the "
            "original. If unset, the locations are replaced.">,
+    Option<"enableDebugInfo", "print-debuginfo", "bool", /*default=*/"false",
+           "Print debug info in MLIR output">,
+    Option<"printGenericOpForm", "print-op-generic", "bool", /*default=*/"false",
+           "Print the generic op form">,
+    Option<"useLocalScope", "print-local-scope", "bool", /*default=*/"false",
+           "Print with local scope and inline information (eliding "
+           "aliases for attributes, types, and locations">,
+    Option<"printPrettyDebugInfo", "pretty-debuginfo", "bool", /*default=*/"false",
+           "Print pretty debug info in MLIR output">,
   ];
 }
 
diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index 05c000bfd8ca..453d4f7c7e8b 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -272,13 +272,13 @@ struct PyAttrBuilderMap {
   static bool dunderContains(const std::string &attributeKind) {
     return PyGlobals::get().lookupAttributeBuilder(attributeKind).has_value();
   }
-  static nb::callable dundeGetItemNamed(const std::string &attributeKind) {
+  static nb::callable dunderGetItemNamed(const std::string &attributeKind) {
     auto builder = PyGlobals::get().lookupAttributeBuilder(attributeKind);
     if (!builder)
       throw nb::key_error(attributeKind.c_str());
     return *builder;
   }
-  static void dundeSetItemNamed(const std::string &attributeKind,
+  static void dunderSetItemNamed(const std::string &attributeKind,
                                 nb::callable func, bool replace) {
     PyGlobals::get().registerAttributeBuilder(attributeKind, std::move(func),
                                               replace);
@@ -287,8 +287,8 @@ struct PyAttrBuilderMap {
   static void bind(nb::module_ &m) {
     nb::class_<PyAttrBuilderMap>(m, "AttrBuilder")
         .def_static("contains", &PyAttrBuilderMap::dunderContains)
-        .def_static("get", &PyAttrBuilderMap::dundeGetItemNamed)
-        .def_static("insert", &PyAttrBuilderMap::dundeSetItemNamed,
+        .def_static("get", &PyAttrBuilderMap::dunderGetItemNamed)
+        .def_static("insert", &PyAttrBuilderMap::dunderSetItemNamed,
                     "attribute_kind"_a, "attr_builder"_a, "replace"_a = false,
                     "Register an attribute builder for building MLIR "
                     "attributes from python values.");
diff --git a/mlir/lib/CAPI/Dialect/LLVM.cpp b/mlir/lib/CAPI/Dialect/LLVM.cpp
index 6ed82ba1a025..da450dd3fd8a 100644
--- a/mlir/lib/CAPI/Dialect/LLVM.cpp
+++ b/mlir/lib/CAPI/Dialect/LLVM.cpp
@@ -55,6 +55,16 @@ MlirType mlirLLVMFunctionTypeGet(MlirType resultType, intptr_t nArgumentTypes,
       unwrapList(nArgumentTypes, argumentTypes, argumentStorage), isVarArg));
 }
 
+intptr_t mlirLLVMFunctionTypeGetNumInputs(MlirType type) {
+  return llvm::cast<LLVM::LLVMFunctionType>(unwrap(type)).getNumParams();
+}
+
+MlirType mlirLLVMFunctionTypeGetInput(MlirType type, intptr_t pos) {
+  assert(pos >= 0 && "pos in array must be positive");
+  return wrap(llvm::cast<LLVM::LLVMFunctionType>(unwrap(type))
+                  .getParamType(static_cast<unsigned>(pos)));
+}
+
 bool mlirTypeIsALLVMStructType(MlirType type) {
   return isa<LLVM::LLVMStructType>(unwrap(type));
 }
diff --git a/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp b/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp
index 6f085cb6ed06..b5a0da15e780 100644
--- a/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp
+++ b/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp
@@ -338,11 +338,6 @@ public:
           padOp, "tosa.pad was unable to determine the pad constant value.");
     }
 
-    Value lowIndex =
-        rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(0));
-    Value highIndex =
-        rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(1));
-
     SmallVector<OpFoldResult, 3> lowValues;
     SmallVector<OpFoldResult, 3> highValues;
 
@@ -350,11 +345,12 @@ public:
     highValues.reserve(rank);
 
     for (int i = 0; i < rank; i++) {
-      Value inputIndex = rewriter.create<arith::ConstantIndexOp>(loc, i);
+      Value lowIndex = rewriter.create<arith::ConstantIndexOp>(loc, 2 * i);
+      Value highIndex = rewriter.create<arith::ConstantIndexOp>(loc, 2 * i + 1);
       Value lowVal = rewriter.createOrFold<tensor::ExtractOp>(
-          loc, padding, ValueRange({inputIndex, lowIndex}));
+          loc, padding, ValueRange({lowIndex}));
       Value highVal = rewriter.createOrFold<tensor::ExtractOp>(
-          loc, padding, ValueRange({inputIndex, highIndex}));
+          loc, padding, ValueRange({highIndex}));
 
       lowVal = rewriter.createOrFold<arith::IndexCastOp>(
           loc, rewriter.getIndexType(), lowVal);
diff --git a/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp
index 82a9fb0d4908..e93b99b4f498 100644
--- a/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp
@@ -91,6 +91,64 @@ struct AffineMaxOpInterface
   };
 };
 
+struct AffineDelinearizeIndexOpInterface
+    : public ValueBoundsOpInterface::ExternalModel<
+          AffineDelinearizeIndexOpInterface, AffineDelinearizeIndexOp> {
+  void populateBoundsForIndexValue(Operation *rawOp, Value value,
+                                   ValueBoundsConstraintSet &cstr) const {
+    auto op = cast<AffineDelinearizeIndexOp>(rawOp);
+    auto result = cast<OpResult>(value);
+    assert(result.getOwner() == rawOp &&
+           "bounded value isn't a result of this delinearize_index");
+    unsigned resIdx = result.getResultNumber();
+
+    AffineExpr linearIdx = cstr.getExpr(op.getLinearIndex());
+
+    SmallVector<OpFoldResult> basis = op.getPaddedBasis();
+    AffineExpr divisor = cstr.getExpr(1);
+    for (OpFoldResult basisElem : llvm::drop_begin(basis, resIdx + 1))
+      divisor = divisor * cstr.getExpr(basisElem);
+
+    if (resIdx == 0) {
+      cstr.bound(value) == linearIdx.floorDiv(divisor);
+      if (!basis.front().isNull())
+        cstr.bound(value) < cstr.getExpr(basis.front());
+      return;
+    }
+    AffineExpr thisBasis = cstr.getExpr(basis[resIdx]);
+    cstr.bound(value) == (linearIdx % (thisBasis * divisor)).floorDiv(divisor);
+  }
+};
+
+struct AffineLinearizeIndexOpInterface
+    : public ValueBoundsOpInterface::ExternalModel<
+          AffineLinearizeIndexOpInterface, AffineLinearizeIndexOp> {
+  void populateBoundsForIndexValue(Operation *rawOp, Value value,
+                                   ValueBoundsConstraintSet &cstr) const {
+    auto op = cast<AffineLinearizeIndexOp>(rawOp);
+    assert(value == op.getResult() &&
+           "value isn't the result of this linearize");
+
+    AffineExpr bound = cstr.getExpr(0);
+    AffineExpr stride = cstr.getExpr(1);
+    SmallVector<OpFoldResult> basis = op.getPaddedBasis();
+    OperandRange multiIndex = op.getMultiIndex();
+    unsigned numArgs = multiIndex.size();
+    for (auto [revArgNum, length] : llvm::enumerate(llvm::reverse(basis))) {
+      unsigned argNum = numArgs - (revArgNum + 1);
+      if (argNum == 0)
+        break;
+      OpFoldResult indexAsFoldRes = getAsOpFoldResult(multiIndex[argNum]);
+      bound = bound + cstr.getExpr(indexAsFoldRes) * stride;
+      stride = stride * cstr.getExpr(length);
+    }
+    bound = bound + cstr.getExpr(op.getMultiIndex().front()) * stride;
+    cstr.bound(value) == bound;
+    if (op.getDisjoint() && !basis.front().isNull()) {
+      cstr.bound(value) < stride *cstr.getExpr(basis.front());
+    }
+  }
+};
 } // namespace
 } // namespace mlir
 
@@ -100,6 +158,10 @@ void mlir::affine::registerValueBoundsOpInterfaceExternalModels(
     AffineApplyOp::attachInterface<AffineApplyOpInterface>(*ctx);
     AffineMaxOp::attachInterface<AffineMaxOpInterface>(*ctx);
     AffineMinOp::attachInterface<AffineMinOpInterface>(*ctx);
+    AffineDelinearizeIndexOp::attachInterface<
+        AffineDelinearizeIndexOpInterface>(*ctx);
+    AffineLinearizeIndexOp::attachInterface<AffineLinearizeIndexOpInterface>(
+        *ctx);
   });
 }
 
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index 0f2c889d4f39..4e02559a0894 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -919,8 +919,9 @@ static void generateUnrolledLoop(
   // 'forOp'.
   auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
 
+  constexpr auto defaultAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
   if (!annotateFn)
-    annotateFn = [](unsigned, Operation *, OpBuilder) {};
+    annotateFn = defaultAnnotateFn;
 
   // Keep a pointer to the last non-terminator operation in the original block
   // so that we know what to clone (since we are doing this in-place).
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp
index 7490e8735f5f..ff1636bc121b 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp
@@ -53,6 +53,23 @@ void LLVMDialect::registerAttributes() {
 }
 
 //===----------------------------------------------------------------------===//
+// AliasScopeAttr
+//===----------------------------------------------------------------------===//
+
+LogicalResult
+AliasScopeAttr::verify(function_ref<InFlightDiagnostic()> emitError,
+                       Attribute id, AliasScopeDomainAttr domain,
+                       StringAttr description) {
+  (void)domain;
+  (void)description;
+  if (!llvm::isa<StringAttr, DistinctAttr>(id))
+    return emitError()
+           << "id of an alias scope must be a StringAttr or a DistrinctAttr";
+
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
 // DINodeAttr
 //===----------------------------------------------------------------------===//
 
diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
index 41410a0a56aa..6cda7100fe07 100644
--- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
@@ -329,8 +329,9 @@ static void generateUnrolledLoop(
   // 'forOp'.
   auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
 
+  constexpr auto defaultAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
   if (!annotateFn)
-    annotateFn = [](unsigned, Operation *, OpBuilder) {};
+    annotateFn = defaultAnnotateFn;
 
   // Keep a pointer to the last non-terminator operation in the original block
   // so that we know what to clone (since we are doing this in-place).
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
index 631d3c48f2df..764a5db48e07 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
@@ -210,7 +210,12 @@ template <typename T>
 static LogicalResult verifyConvOp(T op) {
   // All TOSA conv ops have an input() and weight().
   auto inputType = llvm::dyn_cast<RankedTensorType>(op.getInput().getType());
-  auto weightType = llvm::dyn_cast<RankedTensorType>(op.getWeight().getType());
+
+  RankedTensorType weightType;
+  if constexpr (std::is_same_v<T, tosa::TransposeConv2DOp>)
+    weightType = llvm::dyn_cast<RankedTensorType>(op.getFilter().getType());
+  else
+    weightType = llvm::dyn_cast<RankedTensorType>(op.getWeight().getType());
 
   // Must be ranked tensor types
   if (!inputType) {
@@ -218,7 +223,13 @@ static LogicalResult verifyConvOp(T op) {
     return failure();
   }
   if (!weightType) {
-    op.emitOpError("expect a ranked tensor for weight, got ") << op.getWeight();
+    if constexpr (std::is_same_v<T, tosa::TransposeConv2DOp>) {
+      op.emitOpError("expect a ranked tensor for filter, got ")
+          << op.getFilter();
+    } else {
+      op.emitOpError("expect a ranked tensor for weight, got ")
+          << op.getWeight();
+    }
     return failure();
   }
 
@@ -271,6 +282,38 @@ LogicalResult tosa::ConstOp::verify() {
   return success();
 }
 
+template <typename T>
+static LogicalResult verifyConvOpModes(T op) {
+  auto inputEType =
+      llvm::cast<ShapedType>(op.getInput().getType()).getElementType();
+
+  if (auto quantType =
+          llvm::dyn_cast<mlir::quant::UniformQuantizedType>(inputEType))
+    inputEType = quantType.getStorageType();
+
+  auto accType = op.getAccType();
+  if (inputEType.isInteger(8) && !accType.isInteger(32))
+    return op.emitOpError("accumulator type for i8 tensor is not i32");
+
+  if (inputEType.isInteger(16) && !accType.isInteger(48))
+    return op.emitOpError("accumulator type for i16 tensor is not i48");
+
+  if ((inputEType.isFloat8E5M2() || inputEType.isFloat8E4M3()) &&
+      !accType.isF16())
+    return op.emitOpError("accumulator type for f8 tensor is not f16");
+
+  if (inputEType.isF16() && !(accType.isF16() || accType.isF32()))
+    return op.emitOpError("accumulator type for f16 tensor is not f16/f32");
+
+  if (inputEType.isBF16() && !accType.isF32())
+    return op.emitOpError("accumulator type for bf16 tensor is not f32");
+
+  if (inputEType.isF32() && !accType.isF32())
+    return op.emitOpError("accumulator type for f32 tensor is not f32");
+
+  return success();
+}
+
 LogicalResult tosa::ArgMaxOp::verify() {
   // Ensure output is of 32-bit integer
   const auto resultETy = llvm::cast<ShapedType>(getType()).getElementType();
@@ -368,12 +411,14 @@ static void buildConvOpWithQuantInfo(OpBuilder &builder, OperationState &result,
                                      Type outputType, Value input, Value weight,
                                      Value bias, DenseI64ArrayAttr pad,
                                      DenseI64ArrayAttr stride,
-                                     DenseI64ArrayAttr dilation) {
+                                     DenseI64ArrayAttr dilation,
+                                     TypeAttr accType) {
 
   result.addOperands({input, weight, bias});
   result.addAttribute("pad", pad);
   result.addAttribute("stride", stride);
   result.addAttribute("dilation", dilation);
+  result.addAttribute("acc_type", accType);
 
   auto quantAttr = buildConvOpQuantizationAttr(builder, input, weight);
   if (quantAttr) {
@@ -390,11 +435,12 @@ static void buildConvOpWithQuantInfo(OpBuilder &builder, OperationState &result,
 static void buildTransConvOpWithQuantInfo(
     OpBuilder &builder, OperationState &result, Type outputType, Value input,
     Value weight, Value bias, DenseI64ArrayAttr outpad,
-    DenseI64ArrayAttr stride, DenseI64ArrayAttr outputShape) {
+    DenseI64ArrayAttr stride, DenseI64ArrayAttr outputShape, TypeAttr accType) {
   result.addOperands({input, weight, bias});
   result.addAttribute("out_pad", outpad);
   result.addAttribute("stride", stride);
   result.addAttribute("out_shape", outputShape);
+  result.addAttribute("acc_type", accType);
   auto quantAttr = ::buildConvOpQuantizationAttr(builder, input, weight);
 
   if (quantAttr) {
@@ -787,7 +833,7 @@ LogicalResult tosa::PadOp::inferReturnTypeComponents(
       return success();
     }
 
-    outputShape.resize(paddingShape.getDimSize(0), ShapedType::kDynamic);
+    outputShape.resize(paddingShape.getDimSize(0) / 2, ShapedType::kDynamic);
     inferredReturnShapes.push_back(ShapedTypeComponents(outputShape));
     return success();
   }
@@ -823,13 +869,17 @@ LogicalResult tosa::PadOp::inferReturnTypeComponents(
 LogicalResult tosa::PadOp::verify() {
   RankedTensorType inputType = getInput1().getType();
   RankedTensorType outputType = getOutput().getType();
-  TensorType paddingType = getPadding().getType();
+  RankedTensorType paddingType = getPadding().getType();
 
   if (inputType.getRank() != outputType.getRank())
     return emitOpError() << "expect same input and output tensor rank.";
 
-  if (paddingType.hasRank() && paddingType.getRank() != 2)
-    return emitOpError() << "expect 'padding' tensor rank equal to 2.";
+  if (!paddingType.isDynamicDim(0) &&
+      paddingType.getDimSize(0) != inputType.getRank() * 2)
+    return emitOpError() << "expected padding tensor dim 0 to have size "
+                         << inputType.getRank() * 2
+                         << " (2*rank(shape1)) but got size "
+                         << paddingType.getDimSize(0);
 
   return success();
 }
@@ -1595,7 +1645,11 @@ LogicalResult Conv2DOp::inferReturnTypeComponents(
   return success();
 }
 
-LogicalResult Conv2DOp::verify() { return verifyConvOp(*this); }
+LogicalResult Conv2DOp::verify() {
+  if (verifyConvOp(*this).failed() || verifyConvOpModes(*this).failed())
+    return failure();
+  return success();
+}
 
 LogicalResult Conv3DOp::inferReturnTypeComponents(
     MLIRContext *context, ::std::optional<Location> location,
@@ -1667,7 +1721,11 @@ LogicalResult Conv3DOp::inferReturnTypeComponents(
   return success();
 }
 
-LogicalResult Conv3DOp::verify() { return verifyConvOp(*this); }
+LogicalResult Conv3DOp::verify() {
+  if (verifyConvOp(*this).failed() || verifyConvOpModes(*this).failed())
+    return failure();
+  return success();
+}
 
 LogicalResult AvgPool2dOp::inferReturnTypeComponents(
     MLIRContext *context, ::std::optional<Location> location,
@@ -1762,7 +1820,11 @@ LogicalResult DepthwiseConv2DOp::inferReturnTypeComponents(
   return success();
 }
 
-LogicalResult DepthwiseConv2DOp::verify() { return verifyConvOp(*this); }
+LogicalResult DepthwiseConv2DOp::verify() {
+  if (verifyConvOp(*this).failed() || verifyConvOpModes(*this).failed())
+    return failure();
+  return success();
+}
 
 LogicalResult TransposeConv2DOp::inferReturnTypeComponents(
     MLIRContext *context, ::std::optional<Location> location,
@@ -1828,6 +1890,12 @@ LogicalResult TransposeConv2DOp::inferReturnTypeComponents(
   return success();
 }
 
+LogicalResult TransposeConv2DOp::verify() {
+  if (verifyConvOp(*this).failed() || verifyConvOpModes(*this).failed())
+    return failure();
+  return success();
+}
+
 LogicalResult IfOp::inferReturnTypeComponents(
     MLIRContext *context, ::std::optional<Location> location,
     IfOp::Adaptor adaptor,
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeConv2D.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeConv2D.cpp
index 44f64f76e9b0..04a709c59677 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeConv2D.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeConv2D.cpp
@@ -81,7 +81,7 @@ struct Conv2DIsFullyConnected : public OpRewritePattern<tosa::Conv2DOp> {
         }
       }
 
-      auto padSizeTy = RankedTensorType::get({4, 2}, rewriter.getI64Type());
+      auto padSizeTy = RankedTensorType::get({8}, rewriter.getI64Type());
       auto padSize =
           DenseIntElementsAttr::get(padSizeTy, ArrayRef<int64_t>(pad));
       Value padSizeVal =
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp
index e6fba211dc37..14f392ab8c45 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeDepthwise.cpp
@@ -108,7 +108,7 @@ struct DepthwiseConv2DIsMul : public OpRewritePattern<tosa::DepthwiseConv2DOp> {
         }
       }
 
-      auto padSizeTy = RankedTensorType::get({5, 2}, rewriter.getI64Type());
+      auto padSizeTy = RankedTensorType::get({10}, rewriter.getI64Type());
       auto padSize =
           DenseIntElementsAttr::get(padSizeTy, ArrayRef<int64_t>(pad));
       Value padSizeVal =
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
index 0779cdb9667a..db1e219b601b 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
@@ -75,13 +75,15 @@ public:
           loc, resultTy, input, reverse2, bias,
           rewriter.getDenseI64ArrayAttr(convPad),
           rewriter.getDenseI64ArrayAttr(stride),
-          rewriter.getDenseI64ArrayAttr({1, 1}), *op.getQuantizationInfo());
+          rewriter.getDenseI64ArrayAttr({1, 1}),
+          /* acc_type = */ op.getAccType(), *op.getQuantizationInfo());
     } else {
       conv2d = rewriter.create<tosa::Conv2DOp>(
           loc, resultTy, input, reverse2, bias,
           rewriter.getDenseI64ArrayAttr(convPad),
           rewriter.getDenseI64ArrayAttr(stride),
-          rewriter.getDenseI64ArrayAttr({1, 1}));
+          rewriter.getDenseI64ArrayAttr({1, 1}),
+          /* acc_type = */ op.getAccTypeAttr());
     }
 
     rewriter.replaceOp(op, conv2d);
@@ -139,7 +141,7 @@ public:
     weightPadding[5] =
         (weightWidth % stride[1]) ? (stride[1] - weightWidth % stride[1]) : 0;
     DenseElementsAttr weightPaddingAttr = DenseIntElementsAttr::get(
-        RankedTensorType::get({4, 2}, rewriter.getI32Type()), weightPadding);
+        RankedTensorType::get({8}, rewriter.getI32Type()), weightPadding);
     Value weightPaddingVal = CreateOpAndInferShape<tosa::ConstOp>(
         rewriter, loc, weightPaddingAttr.getType(), weightPaddingAttr);
 
@@ -202,7 +204,7 @@ public:
     inputPadding[5] += restridedWeightTy.getDimSize(2) - 1;
 
     DenseElementsAttr inputPaddingAttr = DenseIntElementsAttr::get(
-        RankedTensorType::get({4, 2}, rewriter.getI32Type()), inputPadding);
+        RankedTensorType::get({8}, rewriter.getI32Type()), inputPadding);
 
     Value inputPaddingVal = CreateOpAndInferShape<tosa::ConstOp>(
         rewriter, loc, inputPaddingAttr.getType(), inputPaddingAttr);
@@ -238,7 +240,7 @@ public:
                    /*pad=*/rewriter.getDenseI64ArrayAttr({0, 0, 0, 0}),
                    /*stride=*/rewriter.getDenseI64ArrayAttr({1, 1}),
                    /*dilation=*/rewriter.getDenseI64ArrayAttr({1, 1}),
-                   *op.getQuantizationInfo())
+                   /* acc_type = */ op.getAccType(), *op.getQuantizationInfo())
                    .getResult();
     } else {
       conv2d = CreateOpAndInferShape<tosa::Conv2DOp>(
@@ -246,7 +248,8 @@ public:
                    weight, zeroBias,
                    /*pad=*/rewriter.getDenseI64ArrayAttr({0, 0, 0, 0}),
                    /*stride=*/rewriter.getDenseI64ArrayAttr({1, 1}),
-                   /*dilation=*/rewriter.getDenseI64ArrayAttr({1, 1}))
+                   /*dilation=*/rewriter.getDenseI64ArrayAttr({1, 1}),
+                   /* acc_type = */ op.getAccTypeAttr())
                    .getResult();
     }
 
@@ -314,7 +317,7 @@ public:
     resultPadding[5] = resultTy.getDimSize(2) - resultPadLeft - sliceSize[2];
 
     DenseElementsAttr resultPaddingAttr = DenseIntElementsAttr::get(
-        RankedTensorType::get({4, 2}, rewriter.getI32Type()), resultPadding);
+        RankedTensorType::get({8}, rewriter.getI32Type()), resultPadding);
 
     Value resultPaddingVal = CreateOpAndInferShape<tosa::ConstOp>(
         rewriter, loc, resultPaddingAttr.getType(), resultPaddingAttr);
diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp
index 6fe96504ae10..c603db450cbd 100644
--- a/mlir/lib/IR/AsmPrinter.cpp
+++ b/mlir/lib/IR/AsmPrinter.cpp
@@ -284,22 +284,29 @@ OpPrintingFlags &OpPrintingFlags::skipRegions(bool skip) {
 }
 
 /// Do not verify the operation when using custom operation printers.
-OpPrintingFlags &OpPrintingFlags::assumeVerified() {
-  assumeVerifiedFlag = true;
+OpPrintingFlags &OpPrintingFlags::assumeVerified(bool enable) {
+  assumeVerifiedFlag = enable;
   return *this;
 }
 
 /// Use local scope when printing the operation. This allows for using the
 /// printer in a more localized and thread-safe setting, but may not necessarily
 /// be identical of what the IR will look like when dumping the full module.
-OpPrintingFlags &OpPrintingFlags::useLocalScope() {
-  printLocalScope = true;
+OpPrintingFlags &OpPrintingFlags::useLocalScope(bool enable) {
+  printLocalScope = enable;
   return *this;
 }
 
 /// Print users of values as comments.
-OpPrintingFlags &OpPrintingFlags::printValueUsers() {
-  printValueUsersFlag = true;
+OpPrintingFlags &OpPrintingFlags::printValueUsers(bool enable) {
+  printValueUsersFlag = enable;
+  return *this;
+}
+
+/// Print unique SSA ID numbers for values, block arguments and naming conflicts
+/// across all regions
+OpPrintingFlags &OpPrintingFlags::printUniqueSSAIDs(bool enable) {
+  printUniqueSSAIDsFlag = enable;
   return *this;
 }
 
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
index cf58bc5d8f47..659ab1227f11 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@@ -237,15 +237,7 @@ public:
       generateMetadata(value.getInt(), "maxnreg");
     } else if (attribute.getName() ==
                NVVM::NVVMDialect::getKernelFuncAttrName()) {
-      llvm::Metadata *llvmMetadataKernel[] = {
-          llvm::ValueAsMetadata::get(llvmFunc),
-          llvm::MDString::get(llvmContext, "kernel"),
-          llvm::ValueAsMetadata::get(
-              llvm::ConstantInt::get(llvm::Type::getInt32Ty(llvmContext), 1))};
-      llvm::MDNode *llvmMetadataNode =
-          llvm::MDNode::get(llvmContext, llvmMetadataKernel);
-      moduleTranslation.getOrInsertNamedModuleMetadata("nvvm.annotations")
-          ->addOperand(llvmMetadataNode);
+      llvmFunc->setCallingConv(llvm::CallingConv::PTX_Kernel);
     }
     return success();
   }
diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
index 95fb673fc72e..2d8d7745eca9 100644
--- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp
@@ -427,19 +427,33 @@ ModuleImport::processAliasScopeMetadata(const llvm::MDNode *node) {
     return node->getNumOperands() != 0 &&
            node == dyn_cast<llvm::MDNode>(node->getOperand(0));
   };
+  auto verifySelfRefOrString = [](const llvm::MDNode *node) {
+    return node->getNumOperands() != 0 &&
+           (node == dyn_cast<llvm::MDNode>(node->getOperand(0)) ||
+            isa<llvm::MDString>(node->getOperand(0)));
+  };
   // Helper that verifies the given operand is a string or does not exist.
   auto verifyDescription = [](const llvm::MDNode *node, unsigned idx) {
     return idx >= node->getNumOperands() ||
            isa<llvm::MDString>(node->getOperand(idx));
   };
+
+  auto getIdAttr = [&](const llvm::MDNode *node) -> Attribute {
+    if (verifySelfRef(node))
+      return DistinctAttr::create(builder.getUnitAttr());
+
+    auto name = cast<llvm::MDString>(node->getOperand(0));
+    return builder.getStringAttr(name->getString());
+  };
+
   // Helper that creates an alias scope domain attribute.
   auto createAliasScopeDomainOp = [&](const llvm::MDNode *aliasDomain) {
     StringAttr description = nullptr;
     if (aliasDomain->getNumOperands() >= 2)
       if (auto *operand = dyn_cast<llvm::MDString>(aliasDomain->getOperand(1)))
         description = builder.getStringAttr(operand->getString());
-    return builder.getAttr<AliasScopeDomainAttr>(
-        DistinctAttr::create(builder.getUnitAttr()), description);
+    Attribute idAttr = getIdAttr(aliasDomain);
+    return builder.getAttr<AliasScopeDomainAttr>(idAttr, description);
   };
 
   // Collect the alias scopes and domains to translate them.
@@ -452,10 +466,11 @@ ModuleImport::processAliasScopeMetadata(const llvm::MDNode *node) {
       // verifying its domain. Perform the verification before looking it up in
       // the alias scope mapping since it could have been inserted as a domain
       // node before.
-      if (!verifySelfRef(scope) || !domain || !verifyDescription(scope, 2))
+      if (!verifySelfRefOrString(scope) || !domain ||
+          !verifyDescription(scope, 2))
         return emitError(loc) << "unsupported alias scope node: "
                               << diagMD(scope, llvmModule.get());
-      if (!verifySelfRef(domain) || !verifyDescription(domain, 1))
+      if (!verifySelfRefOrString(domain) || !verifyDescription(domain, 1))
         return emitError(loc) << "unsupported alias domain node: "
                               << diagMD(domain, llvmModule.get());
 
@@ -473,9 +488,10 @@ ModuleImport::processAliasScopeMetadata(const llvm::MDNode *node) {
       StringAttr description = nullptr;
       if (!aliasScope.getName().empty())
         description = builder.getStringAttr(aliasScope.getName());
+      Attribute idAttr = getIdAttr(scope);
       auto aliasScopeOp = builder.getAttr<AliasScopeAttr>(
-          DistinctAttr::create(builder.getUnitAttr()),
-          cast<AliasScopeDomainAttr>(it->second), description);
+          idAttr, cast<AliasScopeDomainAttr>(it->second), description);
+
       aliasScopeMapping.try_emplace(aliasScope.getNode(), aliasScopeOp);
     }
   }
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index ad62ae0cef57..4367100e3aca 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -1724,25 +1724,36 @@ ModuleTranslation::getOrCreateAliasScope(AliasScopeAttr aliasScopeAttr) {
       aliasScopeAttr.getDomain(), nullptr);
   if (insertedDomain) {
     llvm::SmallVector<llvm::Metadata *, 2> operands;
-    // Placeholder for self-reference.
+    // Placeholder for potential self-reference.
     operands.push_back(dummy.get());
     if (StringAttr description = aliasScopeAttr.getDomain().getDescription())
       operands.push_back(llvm::MDString::get(ctx, description));
     domainIt->second = llvm::MDNode::get(ctx, operands);
     // Self-reference for uniqueness.
-    domainIt->second->replaceOperandWith(0, domainIt->second);
+    llvm::Metadata *replacement;
+    if (auto stringAttr =
+            dyn_cast<StringAttr>(aliasScopeAttr.getDomain().getId()))
+      replacement = llvm::MDString::get(ctx, stringAttr.getValue());
+    else
+      replacement = domainIt->second;
+    domainIt->second->replaceOperandWith(0, replacement);
   }
   // Convert the scope metadata node.
   assert(domainIt->second && "Scope's domain should already be valid");
   llvm::SmallVector<llvm::Metadata *, 3> operands;
-  // Placeholder for self-reference.
+  // Placeholder for potential self-reference.
   operands.push_back(dummy.get());
   operands.push_back(domainIt->second);
   if (StringAttr description = aliasScopeAttr.getDescription())
     operands.push_back(llvm::MDString::get(ctx, description));
   scopeIt->second = llvm::MDNode::get(ctx, operands);
   // Self-reference for uniqueness.
-  scopeIt->second->replaceOperandWith(0, scopeIt->second);
+  llvm::Metadata *replacement;
+  if (auto stringAttr = dyn_cast<StringAttr>(aliasScopeAttr.getId()))
+    replacement = llvm::MDString::get(ctx, stringAttr.getValue());
+  else
+    replacement = scopeIt->second;
+  scopeIt->second->replaceOperandWith(0, replacement);
   return scopeIt->second;
 }
 
diff --git a/mlir/lib/Transforms/LocationSnapshot.cpp b/mlir/lib/Transforms/LocationSnapshot.cpp
index b85850acda91..f701c8b4f0a9 100644
--- a/mlir/lib/Transforms/LocationSnapshot.cpp
+++ b/mlir/lib/Transforms/LocationSnapshot.cpp
@@ -10,6 +10,7 @@
 
 #include "mlir/IR/AsmState.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/OperationSupport.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/FileUtilities.h"
 #include "llvm/Support/FileSystem.h"
@@ -131,29 +132,23 @@ LogicalResult mlir::generateLocationsFromIR(StringRef fileName, StringRef tag,
 namespace {
 struct LocationSnapshotPass
     : public impl::LocationSnapshotBase<LocationSnapshotPass> {
-  LocationSnapshotPass() = default;
-  LocationSnapshotPass(OpPrintingFlags flags, StringRef fileName, StringRef tag)
-      : flags(flags) {
-    this->fileName = fileName.str();
-    this->tag = tag.str();
-  }
+  using impl::LocationSnapshotBase<LocationSnapshotPass>::LocationSnapshotBase;
 
   void runOnOperation() override {
     Operation *op = getOperation();
-    if (failed(generateLocationsFromIR(fileName, op, OpPrintingFlags(), tag)))
+    if (failed(generateLocationsFromIR(fileName, op, getFlags(), tag)))
       return signalPassFailure();
   }
 
-  /// The printing flags to use when creating the snapshot.
-  OpPrintingFlags flags;
+private:
+  /// build the flags from the command line arguments to the pass
+  OpPrintingFlags getFlags() {
+    OpPrintingFlags flags;
+    flags.enableDebugInfo(enableDebugInfo, printPrettyDebugInfo);
+    flags.printGenericOpForm(printGenericOpForm);
+    if (useLocalScope)
+      flags.useLocalScope();
+    return flags;
+  }
 };
 } // namespace
-
-std::unique_ptr<Pass> mlir::createLocationSnapshotPass(OpPrintingFlags flags,
-                                                       StringRef fileName,
-                                                       StringRef tag) {
-  return std::make_unique<LocationSnapshotPass>(flags, fileName, tag);
-}
-std::unique_ptr<Pass> mlir::createLocationSnapshotPass() {
-  return std::make_unique<LocationSnapshotPass>();
-}
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 318f0f78efa5..f52dd6c0d0ce 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -633,7 +633,7 @@ gpu.module @test_module_29 {
     // CHECK-NEXT: %[[EL1:.*]] = llvm.getelementptr %[[ALLOC]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i32, f64)>
     // CHECK-NEXT: llvm.store %[[EXT]], %[[EL1]] : f64, !llvm.ptr
     // CHECK-NEXT: llvm.call @vprintf(%[[FORMATSTART]], %[[ALLOC]]) : (!llvm.ptr, !llvm.ptr) -> i32
-    gpu.printf "Hello: %d\n" %arg0, %arg1 : i32, f32
+    gpu.printf "Hello: %d\n", %arg0, %arg1 : i32, f32
     gpu.return
   }
 }
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir
index 1b904fa142ba..2dc6a5ab2a86 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir
@@ -36,7 +36,7 @@ gpu.module @test_module {
     // CHECK-NEXT: %[[NARGS1:.*]] = llvm.mlir.constant(1 : i32) : i32
     // CHECK-NEXT: %[[ARG0_64:.*]] = llvm.zext %[[ARG0]] : i32 to i64
     // CHECK-NEXT: %{{.*}} = llvm.call @__ockl_printf_append_args(%[[DESC1]], %[[NARGS1]], %[[ARG0_64]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[ISLAST]]) : (i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64
-    gpu.printf "Hello: %d\n" %arg0 : i32
+    gpu.printf "Hello: %d\n", %arg0 : i32
     gpu.return
   }
 }
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir
index 870f5c5016ec..00d1d7d85268 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir
@@ -9,7 +9,7 @@ gpu.module @test_module {
     // CHECK: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr<4>
     // CHECK-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][0, 0] : (!llvm.ptr<4>) -> !llvm.ptr<4>, !llvm.array<11 x i8>
     // CHECK-NEXT: %{{.*}} = llvm.call @printf(%[[IMM2]], %[[ARG0]]) vararg(!llvm.func<i32 (ptr<4>, ...)>) : (!llvm.ptr<4>, i32) -> i32
-    gpu.printf "Hello: %d\n" %arg0 : i32
+    gpu.printf "Hello: %d\n", %arg0 : i32
     gpu.return
   }
 }
diff --git a/mlir/test/Conversion/GPUToSPIRV/printf.mlir b/mlir/test/Conversion/GPUToSPIRV/printf.mlir
index bc091124ea4c..7fe9752b088d 100644
--- a/mlir/test/Conversion/GPUToSPIRV/printf.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/printf.mlir
@@ -62,7 +62,7 @@ module attributes {
         // CHECK: [[FMTSTR_ADDR:%.*]] = spirv.mlir.addressof [[PRINTMSG]] : !spirv.ptr<!spirv.array<[[ARRAYSIZE]] x i8>, UniformConstant>
         // CHECK-NEXT: [[FMTSTR_PTR1:%.*]] = spirv.Bitcast [[FMTSTR_ADDR]] : !spirv.ptr<!spirv.array<[[ARRAYSIZE]] x i8>, UniformConstant> to !spirv.ptr<i8, UniformConstant>
         // CHECK-NEXT:  {{%.*}} = spirv.CL.printf [[FMTSTR_PTR1]] {{%.*}}, {{%.*}}, {{%.*}} : !spirv.ptr<i8, UniformConstant>, i32, f32, i32 -> i32
-        gpu.printf "\nHello, world : %d %f \n Thread id: %d\n" %arg0, %arg1, %2: i32, f32, index
+        gpu.printf "\nHello, world : %d %f \n Thread id: %d\n", %arg0, %arg1, %2: i32, f32, index
 
         // CHECK: spirv.Return
         gpu.return
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
index bfdc72ee07e9..453a8610e716 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
@@ -510,7 +510,7 @@ func.func @avg_pool_dyn(%arg0: tensor<?x6x34x62xf32>) -> (tensor<?x5x33x62xf32>)
 func.func @conv2d_scalar_bias_f32(%input: tensor<1x49x42x27xf32>, %weights: tensor<28x3x3x27xf32>, %bias: tensor<1xf32>) -> () {
   // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x45x40x28xf32>
   // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<1xf32>) outs(%[[INIT]] : tensor<1x45x40x28xf32>) {
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<1xf32>) -> tensor<1x45x40x28xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<1xf32>) -> tensor<1x45x40x28xf32>
   return
 }
 
@@ -531,7 +531,7 @@ func.func @conv2d_i8(%input: tensor<1x49x42x27xi8>, %weights: tensor<28x1x1x27xi
   // CHECK: linalg.conv_2d_nhwc_fhwc_q {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1, %c0_i32, %c0_i32_0 : tensor<1x49x42x27xi8>, tensor<28x1x1x27xi8>, i32, i32) outs(%[[BROADCAST]] : tensor<1x45x40x28xi32>) -> tensor<1x45x40x28xi32>
   // HWCF: linalg.conv_2d_nhwc_hwcf_q {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %[[TRANSPOSE]], %c0_i32, %c0_i32_0 : tensor<1x49x42x27xi8>, tensor<1x1x27x28xi8>, i32, i32) outs(%{{[a-zA-Z0-9_]*}} : tensor<1x45x40x28xi32>) -> tensor<1x45x40x28xi32>
 
-  %0 = tosa.conv2d %input, %weights, %bias {dilation = array<i64: 2, 1>, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>, stride = array<i64: 1, 1>} : (tensor<1x49x42x27xi8>, tensor<28x1x1x27xi8>, tensor<28xi8>) -> tensor<1x45x40x28xi32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = i32, dilation = array<i64: 2, 1>, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>, stride = array<i64: 1, 1>} : (tensor<1x49x42x27xi8>, tensor<28x1x1x27xi8>, tensor<28xi8>) -> tensor<1x45x40x28xi32>
   return
 }
 
@@ -552,7 +552,7 @@ func.func @conv2d_f32(%input: tensor<1x49x42x27xf32>, %weights: tensor<28x3x3x27
   // CHECK: linalg.conv_2d_nhwc_fhwc {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>) outs(%1 : tensor<1x45x40x28xf32>) -> tensor<1x45x40x28xf32>
 
   // HWCF: linalg.conv_2d_nhwc_hwcf {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %[[TRANSPOSE]] : tensor<1x49x42x27xf32>, tensor<3x3x27x28xf32>) outs(%{{[a-zA-Z0-9_]*}} : tensor<1x45x40x28xf32>
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> tensor<1x45x40x28xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> tensor<1x45x40x28xf32>
   return
 }
 
@@ -571,7 +571,7 @@ func.func @conv2d_dyn(%input: tensor<?x49x42x27xf32>, %weights: tensor<28x3x3x27
   // CHECK:   linalg.yield %[[IN]] : f32
   // CHECK: } -> tensor<?x45x40x28xf32>
   // CHECK: %2 = linalg.conv_2d_nhwc_fhwc {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<?x49x42x27xf32>, tensor<28x3x3x27xf32>) outs(%[[BROADCAST]] : tensor<?x45x40x28xf32>) -> tensor<?x45x40x28xf32>
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<?x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> tensor<?x45x40x28xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<?x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> tensor<?x45x40x28xf32>
   return
 }
 
@@ -627,7 +627,7 @@ func.func @conv2d_dyn_w_h(%input: tensor<1x?x?x27xf32>, %weights: tensor<28x3x3x
   // CHECK: } -> tensor<1x?x?x28xf32>
   // CHECK: linalg.conv_2d_nhwc_fhwc {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x?x?x27xf32>, tensor<28x3x3x27xf32>) outs(%17 : tensor<1x?x?x28xf32>) -> tensor<1x?x?x28xf32>
 
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x?x?x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> tensor<1x?x?x28xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x?x?x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> tensor<1x?x?x28xf32>
   return
 }
 
@@ -650,7 +650,7 @@ func.func @conv2d_dyn_output(%input: tensor<2x6x5x4xf32>, %weights: tensor<4x3x3
   //   linalg.yield %[[ADD]] : f32
   // } -> tensor<?x4x3x4xf32>
 
-  %0 = tosa.conv2d %input, %weights, %bias {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x6x5x4xf32    >, tensor<4x3x3x4xf32>, tensor<4xf32>) -> tensor<?x4x3x4xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x6x5x4xf32    >, tensor<4x3x3x4xf32>, tensor<4xf32>) -> tensor<?x4x3x4xf32>
   return
 }
 
@@ -662,7 +662,7 @@ func.func @conv2d_padded_f32(%input: tensor<1x47x40x28xf32>, %weights: tensor<28
   // CHECK: tensor.pad %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0]
   // CHECK:   tensor.yield %[[C0]]
   // CHECK: linalg.conv_2d_nhwc_fhwc
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 1, 1, 1, 1>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x47x40x28xf32>, tensor<28x3x3x28xf32>, tensor<28xf32>) -> tensor<1x45x40x28xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 1, 1, 1, 1>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x47x40x28xf32>, tensor<28x3x3x28xf32>, tensor<28xf32>) -> tensor<1x45x40x28xf32>
   return
 }
 
@@ -674,7 +674,7 @@ func.func @conv2d_quant(%arg0 : tensor<1x12x12x1xi8>, %arg1 : tensor<1024x3x3x1x
   // CHECK: tensor.pad %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0]
   // CHECK:   tensor.yield %[[C22]]
   // CHECK: linalg.conv_2d_nhwc_fhwc_q
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 1, 1, 1, 1>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, stride = array<i64: 1, 1>} : (tensor<1x12x12x1xi8>, tensor<1024x3x3x1xi8>, tensor<1024xi32>) -> tensor<1x12x12x1024xi32>
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 1, 1, 1, 1>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, stride = array<i64: 1, 1>} : (tensor<1x12x12x1xi8>, tensor<1024x3x3x1xi8>, tensor<1024xi32>) -> tensor<1x12x12x1024xi32>
   return
 }
 
@@ -696,7 +696,7 @@ func.func @depthwise_conv(%arg0 : tensor<1x7x5x3xf32>, %arg1 : tensor<3x1x3x11xf
   // CHECK:   [[ADD:%.+]] = arith.addf %[[ARG3]], %[[ARG4]] : f32
   // CHECK:   linalg.yield [[ADD]] : f32
   // CHECK: } -> tensor<1x5x5x33xf32>
-  %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 { pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>)  -> tensor<1x5x5x33xf32>
+  %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 { acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>)  -> tensor<1x5x5x33xf32>
   return
 }
 
@@ -712,7 +712,7 @@ func.func @depthwise_conv_scalar_bias(%arg0 : tensor<1x7x5x3xf32>, %arg1 : tenso
   // CHECK:   [[ADD:%.+]] = arith.addf %[[ARG3]], %[[ARG4]] : f32
   // CHECK:   linalg.yield [[ADD]] : f32
   // CHECK: } -> tensor<1x5x5x33xf32>
-  %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 { pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<1xf32>)  -> tensor<1x5x5x33xf32>
+  %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<1xf32>)  -> tensor<1x5x5x33xf32>
   return
 }
 
@@ -736,7 +736,7 @@ func.func @depthwise_conv_dyn(%arg0 : tensor<?x7x5x3xf32>, %arg1 : tensor<3x1x3x
   // CHECK:   %[[ADD:.+]] = arith.addf %[[ARG3]], %[[ARG4]] : f32
   // CHECK:   linalg.yield %[[ADD]] : f32
   // CHECK: } -> tensor<?x5x5x33xf32>
-  %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 { pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<?x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>)  -> tensor<?x5x5x33xf32>
+  %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 { acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<?x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>)  -> tensor<?x5x5x33xf32>
   return
 }
 
@@ -758,7 +758,7 @@ func.func @depthwise_conv_strides(%arg0 : tensor<1x11x9x3xf32>, %arg1 : tensor<3
   // CHECK:   [[ADD:%.+]] = arith.addf %[[ARG3]], %[[ARG4]] : f32
   // CHECK:   linalg.yield [[ADD]] : f32
   // CHECK: } -> tensor<1x5x5x33xf32>
-  %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 { pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 2, 2>, dilation = array<i64: 1, 1> } : (tensor<1x11x9x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>)  -> tensor<1x5x5x33xf32>
+  %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 { acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 2, 2>, dilation = array<i64: 1, 1> } : (tensor<1x11x9x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>)  -> tensor<1x5x5x33xf32>
   return
 }
 
@@ -786,7 +786,7 @@ func.func @depthwise_conv_quant(%arg0 : tensor<1x12x12x4xi8>, %arg1 : tensor<3x3
   // CHECK:   [[ADD:%.+]] = arith.addi %[[ARG3]], %[[ARG4]] : i32
   // CHECK:   linalg.yield [[ADD]] : i32
   // CHECK: } -> tensor<1x12x12x512xi32>
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 1, 1, 1, 1>, quantization_info = #tosa.conv_quant<input_zp = -128, weight_zp = 42>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<1x12x12x4xi8>, tensor<3x3x4x128xi8>, tensor<512xi32>) -> tensor<1x12x12x512xi32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = i32, pad = array<i64: 1, 1, 1, 1>, quantization_info = #tosa.conv_quant<input_zp = -128, weight_zp = 42>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<1x12x12x4xi8>, tensor<3x3x4x128xi8>, tensor<512xi32>) -> tensor<1x12x12x512xi32>
   return
 }
 
@@ -810,7 +810,7 @@ func.func @depthwise_conv_quant_dilations(%arg0 : tensor<1x14x14x4xi8>, %arg1 :
   // CHECK:   [[ADD:%.+]] = arith.addi %[[ARG3]], %[[ARG4]] : i32
   // CHECK:   linalg.yield [[ADD]] : i32
   // CHECK: } -> tensor<1x10x10x512xi32>
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -128, weight_zp = 42>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 2> } : (tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, tensor<512xi32>)  -> tensor<1x10x10x512xi32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = i32, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -128, weight_zp = 42>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 2> } : (tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, tensor<512xi32>)  -> tensor<1x10x10x512xi32>
   return
 }
 
@@ -826,7 +826,7 @@ func.func @depthwise_conv2d_dyn_w_h(%arg0: tensor<2x?x?x3xf32>, %arg1: tensor<3x
   // CHECK:  } : tensor<2x?x?x3xf32> to tensor<2x?x?x3xf32>
   // CHECK: %[[CONV:.+]] = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<[1, 2]> : tensor<2xi64>} ins(%[[PADDED]], %arg1 : tensor<2x?x?x3xf32>, tensor<3x6x3x5xf32>) outs(%{{.*}} : tensor<2x?x?x3x5xf32>) -> tensor<2x?x?x3x5xf32>
   // CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[CONV]] {{\[}}[0], [1], [2], [3, 4]]
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 1, 2, 3, 4>, dilation = array<i64: 2, 1>, stride = array<i64: 1, 2>} : (tensor<2x?x?x3xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<2x?x?x15xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, pad = array<i64: 1, 2, 3, 4>, dilation = array<i64: 2, 1>, stride = array<i64: 1, 2>} : (tensor<2x?x?x3xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<2x?x?x15xf32>
   return
 }
 
@@ -850,7 +850,7 @@ func.func @conv3d_f32(%input: tensor<1x49x48x47x27xf32>, %weights: tensor<28x3x4
   // CHECK-SAME: {dilations = dense<1> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>}
   // CHECK-SAME: ins(%arg0, %[[TRANSPOSE]] : tensor<1x49x48x47x27xf32>, tensor<3x4x5x27x28xf32>)
   // CHECK-SAME: outs(%[[BROADCAST]] : tensor<1x47x45x43x28xf32>) -> tensor<1x47x45x43x28xf32>
-  %0 = tosa.conv3d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>, dilation = array<i64: 1, 1, 1>} : (tensor<1x49x48x47x27xf32>, tensor<28x3x4x5x27xf32>, tensor<28xf32>)  -> tensor<1x47x45x43x28xf32>
+  %0 = tosa.conv3d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>, dilation = array<i64: 1, 1, 1>} : (tensor<1x49x48x47x27xf32>, tensor<28x3x4x5x27xf32>, tensor<28xf32>)  -> tensor<1x47x45x43x28xf32>
   return
 }
 
@@ -864,7 +864,7 @@ func.func @conv3d_scalar_bias_f32(%input: tensor<1x49x48x47x27xf32>, %weights: t
   // CHECK:  %[[INIT:.+]] = tensor.empty() : tensor<1x47x45x43x28xf32>
   // CHECK:      %[[BROADCAST:.+]] = linalg.generic
   // CHECK-SAME: {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
-  %0 = tosa.conv3d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>, dilation = array<i64: 1, 1, 1>} : (tensor<1x49x48x47x27xf32>, tensor<28x3x4x5x27xf32>, tensor<1xf32>)  -> tensor<1x47x45x43x28xf32>
+  %0 = tosa.conv3d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>, dilation = array<i64: 1, 1, 1>} : (tensor<1x49x48x47x27xf32>, tensor<28x3x4x5x27xf32>, tensor<1xf32>)  -> tensor<1x47x45x43x28xf32>
   return
 }
 
@@ -892,7 +892,7 @@ func.func @conv3d_i8(%input: tensor<1x49x48x47x27xi8>, %weights: tensor<28x3x4x5
   // CHECK-SAME: ins(%arg0, %[[TRANSPOSE]], %[[IZP]], %[[FZP]] : tensor<1x49x48x47x27xi8>, tensor<3x4x5x27x28xi8>, i32, i32)
   // CHECK-SAME: outs(%[[BROADCAST]] : tensor<1x47x45x43x28xi32>) -> tensor<1x47x45x43x28xi32>
 
-  %0 = tosa.conv3d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -128, weight_zp = 42>, stride = array<i64: 1, 1, 1>, dilation = array<i64: 1, 1, 1>} : (tensor<1x49x48x47x27xi8>, tensor<28x3x4x5x27xi8>, tensor<28xi32>)  -> tensor<1x47x45x43x28xi32>
+  %0 = tosa.conv3d %input, %weights, %bias {acc_type = i32, pad = array<i64: 0, 0, 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -128, weight_zp = 42>, stride = array<i64: 1, 1, 1>, dilation = array<i64: 1, 1, 1>} : (tensor<1x49x48x47x27xi8>, tensor<28x3x4x5x27xi8>, tensor<28xi32>)  -> tensor<1x47x45x43x28xi32>
   return
 }
 
diff --git a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir
index 1e62e25176a0..0b9a64494bc0 100644
--- a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir
+++ b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir
@@ -459,85 +459,65 @@ func.func @slice_dyn(%arg0: tensor<?xf32>) -> (tensor<?xf32>) {
 // CHECK-LABEL: @pad_float
 // CHECK-SAME: (%[[ARG0:[0-9a-zA-Z_]*]]:
 func.func @pad_float(%arg0 : tensor<1x2xf32>) -> (tensor<4x9xf32>) {
-  %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-  // TODO: Output contains multiple "arith.constant 1 : index".
-  // CHECK-DAG: [[INDEX1:%.+]] = arith.constant 1 : index
-  // CHECK-DAG: [[INDEX2:%.+]] = arith.constant 2 : index
-  // CHECK-DAG: [[INDEX3:%.+]] = arith.constant 3 : index
-  // CHECK-DAG: [[INDEX4:%.+]] = arith.constant 4 : index
+  %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
   // CHECK-DAG: [[CST:%.+]] = arith.constant 0.000000e+00 : f32
-  // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, [[INDEX3]]] high{{\[}}[[INDEX2]], [[INDEX4]]]  {
+  // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, %{{.*}}] high{{\[}}%{{.*}}, %{{.*}}] {
   // CHECK:   tensor.yield [[CST]]
   // CHECK: } : tensor<1x2xf32> to tensor<4x9xf32>
-  %1 = "tosa.pad"(%arg0, %0)  : (tensor<1x2xf32>, tensor<2x2xi32>)  -> (tensor<4x9xf32>)
+  %1 = "tosa.pad"(%arg0, %0)  : (tensor<1x2xf32>, tensor<4xi32>)  -> (tensor<4x9xf32>)
   return %1 : tensor<4x9xf32>
 }
 
 func.func @pad_int(%arg0 : tensor<1x2xi32>) -> (tensor<4x9xi32>) {
-  %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
+  %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
   // CHECK: [[CST:%.+]] = arith.constant 0 : i32
   // CHECK: tensor.pad
   // CHECK:   tensor.yield [[CST]]
-  %1 = "tosa.pad"(%arg0, %0)  : (tensor<1x2xi32>, tensor<2x2xi32>)  -> (tensor<4x9xi32>)
+  %1 = "tosa.pad"(%arg0, %0)  : (tensor<1x2xi32>, tensor<4xi32>)  -> (tensor<4x9xi32>)
   return %1 : tensor<4x9xi32>
 }
 
 func.func @pad_quant(%arg0 : tensor<1x2xi32>) -> (tensor<4x9xi32>) {
-  %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
+  %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
   // CHECK: [[CST:%.+]] = arith.constant 42 : i32
   // CHECK: tensor.pad
   // CHECK:   tensor.yield [[CST]]
-  %1 = "tosa.pad"(%arg0, %0) {quantization_info = #tosa.pad_quant<input_zp = 42>} : (tensor<1x2xi32>, tensor<2x2xi32>)  -> (tensor<4x9xi32>)
+  %1 = "tosa.pad"(%arg0, %0) {quantization_info = #tosa.pad_quant<input_zp = 42>} : (tensor<1x2xi32>, tensor<4xi32>)  -> (tensor<4x9xi32>)
   return %1 : tensor<4x9xi32>
 }
 
 // -----
 
 func.func @pad_float_explicit(%arg0 : tensor<1x2xf32>) -> (tensor<4x9xf32>) {
-  %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-  // TODO: Output contains multiple "arith.constant 1 : index".
-  // CHECK-DAG: [[INDEX1:%.+]] = arith.constant 1 : index
-  // CHECK-DAG: [[INDEX2:%.+]] = arith.constant 2 : index
-  // CHECK-DAG: [[INDEX3:%.+]] = arith.constant 3 : index
-  // CHECK-DAG: [[INDEX4:%.+]] = arith.constant 4 : index
+  %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
   // CHECK-DAG: [[CST:%.+]] = arith.constant 4.200000e+01 : f32
-  // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, [[INDEX3]]] high{{\[}}[[INDEX2]], [[INDEX4]]]  {
+  // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, %{{.*}}] high{{\[}}%{{.*}}, %{{.*}}] {
   // CHECK:   tensor.yield [[CST]]
   // CHECK: } : tensor<1x2xf32> to tensor<4x9xf32>
   %1 = arith.constant dense<42.0> : tensor<f32>
-  %2 = "tosa.pad"(%arg0, %0, %1)  : (tensor<1x2xf32>, tensor<2x2xi32>, tensor<f32>)  -> (tensor<4x9xf32>)
+  %2 = "tosa.pad"(%arg0, %0, %1)  : (tensor<1x2xf32>, tensor<4xi32>, tensor<f32>)  -> (tensor<4x9xf32>)
   return %2 : tensor<4x9xf32>
 }
 
 // -----
 
 func.func @pad_dyn_input(%arg0 : tensor<?x2xf32>) -> (tensor<?x9xf32>) {
-  %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-  // TODO: Output contains multiple "arith.constant 1 : index".
-  // CHECK-DAG: [[INDEX1:%.+]] = arith.constant 1 : index
-  // CHECK-DAG: [[INDEX2:%.+]] = arith.constant 2 : index
-  // CHECK-DAG: [[INDEX3:%.+]] = arith.constant 3 : index
-  // CHECK-DAG: [[INDEX4:%.+]] = arith.constant 4 : index
+  %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
   // CHECK-DAG: [[CST:%.+]] = arith.constant 0.000000e+00 : f32
-  // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, [[INDEX3]]] high{{\[}}[[INDEX2]], [[INDEX4]]]  {
+  // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, %{{.*}}] high{{\[}}%{{.*}}, %{{.*}}] {
   // CHECK:   tensor.yield [[CST]]
   // CHECK: } : tensor<?x2xf32> to tensor<?x9xf32>
-  %1 = "tosa.pad"(%arg0, %0)  : (tensor<?x2xf32>, tensor<2x2xi32>)  -> (tensor<?x9xf32>)
+  %1 = "tosa.pad"(%arg0, %0)  : (tensor<?x2xf32>, tensor<4xi32>)  -> (tensor<?x9xf32>)
   return %1 : tensor<?x9xf32>
 }
 
 func.func @pad_dyn_padding(%arg0 : tensor<1x2xf32>) -> (tensor<?x9xf32>) {
-  %0 = arith.constant dense<[[-1, 2], [3, 4]]> : tensor<2x2xi32>
-  // TODO: Output contains multiple "arith.constant 1 : index".
-  // CHECK-DAG: [[INDEX1:%.+]] = arith.constant 1 : index
-  // CHECK-DAG: [[INDEX2:%.+]] = arith.constant 2 : index
-  // CHECK-DAG: [[INDEX3:%.+]] = arith.constant 3 : index
-  // CHECK-DAG: [[INDEX4:%.+]] = arith.constant 4 : index
+  %0 = arith.constant dense<[-1, 2, 3, 4]> : tensor<4xi32>
   // CHECK-DAG: [[CST:%.+]] = arith.constant 0.000000e+00 : f32
-  // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, [[INDEX3]]] high{{\[}}[[INDEX2]], [[INDEX4]]]  {
+  // CHECK: tensor.pad %[[ARG0]] low{{\[}}%{{.*}}, %{{.*}}] high{{\[}}%{{.*}}, %{{.*}}] {
   // CHECK:   tensor.yield [[CST]]
   // CHECK: } : tensor<1x2xf32> to tensor<?x9xf32>
-  %1 = "tosa.pad"(%arg0, %0)  : (tensor<1x2xf32>, tensor<2x2xi32>)  -> (tensor<?x9xf32>)
+  %1 = "tosa.pad"(%arg0, %0)  : (tensor<1x2xf32>, tensor<4xi32>)  -> (tensor<?x9xf32>)
   return %1 : tensor<?x9xf32>
 }
 
diff --git a/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir b/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir
index 935c08aceff5..5354eb38d7b0 100644
--- a/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir
+++ b/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir
@@ -155,3 +155,84 @@ func.func @compare_maps(%a: index, %b: index) {
       : (index, index, index, index) -> ()
   return
 }
+
+// -----
+
+// CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 floordiv 15)>
+// CHECK-DAG: #[[$map2:.+]] = affine_map<()[s0] -> ((s0 mod 15) floordiv 5)>
+// CHECK-DAG: #[[$map3:.+]] = affine_map<()[s0] -> (s0 mod 5)>
+// CHECK-LABEL: func.func @delinearize_static
+// CHECK-SAME: (%[[arg0:.+]]: index)
+// CHECK-DAG: %[[v1:.+]] = affine.apply #[[$map1]]()[%[[arg0]]]
+// CHECK-DAG: %[[v2:.+]] = affine.apply #[[$map2]]()[%[[arg0]]]
+// CHECK-DAG: %[[v3:.+]] = affine.apply #[[$map3]]()[%[[arg0]]]
+// CHECK: return %[[v1]], %[[v2]], %[[v3]]
+func.func @delinearize_static(%arg0: index) -> (index, index, index) {
+  %c2 = arith.constant 2 : index
+  %c3 = arith.constant 3 : index
+  %0:3 = affine.delinearize_index %arg0 into (2, 3, 5) : index, index, index
+  %1 = "test.reify_bound"(%0#0) {type = "EQ"} : (index) -> (index)
+  %2 = "test.reify_bound"(%0#1) {type = "EQ"} : (index) -> (index)
+  %3 = "test.reify_bound"(%0#2) {type = "EQ"} : (index) -> (index)
+  // expected-remark @below{{true}}
+  "test.compare"(%0#0, %c2) {cmp = "LT"} : (index, index) -> ()
+  // expected-remark @below{{true}}
+  "test.compare"(%0#1, %c3) {cmp = "LT"} : (index, index) -> ()
+  return %1, %2, %3 : index, index, index
+}
+
+// -----
+
+// CHECK-DAG: #[[$map1:.+]] = affine_map<()[s0] -> (s0 floordiv 15)>
+// CHECK-DAG: #[[$map2:.+]] = affine_map<()[s0] -> ((s0 mod 15) floordiv 5)>
+// CHECK-DAG: #[[$map3:.+]] = affine_map<()[s0] -> (s0 mod 5)>
+// CHECK-LABEL: func.func @delinearize_static_no_outer_bound
+// CHECK-SAME: (%[[arg0:.+]]: index)
+// CHECK-DAG: %[[v1:.+]] = affine.apply #[[$map1]]()[%[[arg0]]]
+// CHECK-DAG: %[[v2:.+]] = affine.apply #[[$map2]]()[%[[arg0]]]
+// CHECK-DAG: %[[v3:.+]] = affine.apply #[[$map3]]()[%[[arg0]]]
+// CHECK: return %[[v1]], %[[v2]], %[[v3]]
+func.func @delinearize_static_no_outer_bound(%arg0: index) -> (index, index, index) {
+  %c2 = arith.constant 2 : index
+  %c3 = arith.constant 3 : index
+  %0:3 = affine.delinearize_index %arg0 into (3, 5) : index, index, index
+  %1 = "test.reify_bound"(%0#0) {type = "EQ"} : (index) -> (index)
+  %2 = "test.reify_bound"(%0#1) {type = "EQ"} : (index) -> (index)
+  %3 = "test.reify_bound"(%0#2) {type = "EQ"} : (index) -> (index)
+  "test.compaare"(%0#0, %c2) {cmp = "LT"} : (index, index) -> ()
+  // expected-remark @below{{true}}
+  "test.compare"(%0#1, %c3) {cmp = "LT"} : (index, index) -> ()
+  return %1, %2, %3 : index, index, index
+}
+
+// -----
+
+// CHECK: #[[$map:.+]] = affine_map<()[s0, s1] -> (s0 + s1 * 3)>
+// CHECK-LABEL: func.func @linearize_static
+// CHECK-SAME: (%[[arg0:.+]]: index, %[[arg1:.+]]: index)
+// CHECK: %[[v1:.+]] = affine.apply #[[$map]]()[%[[arg1]], %[[arg0]]]
+// CHECK: return %[[v1]]
+func.func @linearize_static(%arg0: index, %arg1: index)  -> index {
+  %c6 = arith.constant 6 : index
+  %0 = affine.linearize_index disjoint [%arg0, %arg1] by (2, 3) : index
+  %1 = "test.reify_bound"(%0) {type = "EQ"} : (index) -> (index)
+  // expected-remark @below{{true}}
+  "test.compare"(%0, %c6) {cmp = "LT"} : (index, index) -> ()
+  return %1 : index
+}
+
+// -----
+
+// CHECK: #[[$map:.+]] = affine_map<()[s0, s1] -> (s0 + s1 * 3)>
+// CHECK-LABEL: func.func @linearize_static_no_outer_bound
+// CHECK-SAME: (%[[arg0:.+]]: index, %[[arg1:.+]]: index)
+// CHECK: %[[v1:.+]] = affine.apply #[[$map]]()[%[[arg1]], %[[arg0]]]
+// CHECK: return %[[v1]]
+func.func @linearize_static_no_outer_bound(%arg0: index, %arg1: index)  -> index {
+  %c6 = arith.constant 6 : index
+  %0 = affine.linearize_index disjoint [%arg0, %arg1] by (3) : index
+  %1 = "test.reify_bound"(%0) {type = "EQ"} : (index) -> (index)
+  // expected-error @below{{unknown}}
+  "test.compare"(%0, %c6) {cmp = "LT"} : (index, index) -> ()
+  return %1 : index
+}
diff --git a/mlir/test/Dialect/GPU/indirect-device-func-call.mlir b/mlir/test/Dialect/GPU/indirect-device-func-call.mlir
index 91d7f1cd6c67..85805da3ac10 100644
--- a/mlir/test/Dialect/GPU/indirect-device-func-call.mlir
+++ b/mlir/test/Dialect/GPU/indirect-device-func-call.mlir
@@ -6,7 +6,7 @@ gpu.module @kernels {
     func.func @hello(%arg0 : f32) {
         %tid_x = gpu.thread_id x
         %csti8 = arith.constant 2 : i8
-        gpu.printf "Hello from %lld, %d, %f\n" %tid_x, %csti8, %arg0  : index, i8, f32
+        gpu.printf "Hello from %lld, %d, %f\n", %tid_x, %csti8, %arg0  : index, i8, f32
         return
     }
     // CHECK-LABEL: @hello_indirect
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index c0ff2044b76c..99915c493ea4 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -229,9 +229,22 @@ module attributes {gpu.container_module} {
 
     // CHECK-LABEL: gpu.func @printf_test
     // CHECK: (%[[ARG0:.*]]: i32)
-    // CHECK: gpu.printf "Value: %d" %[[ARG0]] : i32
+    // CHECK: gpu.printf "Value: %d", %[[ARG0]] : i32
     gpu.func @printf_test(%arg0 : i32) {
-      gpu.printf "Value: %d" %arg0 : i32
+      gpu.printf "Value: %d", %arg0 : i32
+      gpu.return
+    }
+
+    // CHECK-LABEL: gpu.func @printf_empty
+    // CHECK: gpu.printf  "]"
+    // CHECK: scf.if
+    // CHECK: gpu.printf ", "
+    gpu.func @printf_empty(%arg0 : i32) {
+      gpu.printf "]"
+      %1 = arith.cmpi slt, %arg0, %arg0 : i32
+      scf.if %1 {
+        gpu.printf ", "
+      } 
       gpu.return
     }
 
diff --git a/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
index 732f40c4333d..f02b26dba97d 100644
--- a/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
+++ b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
@@ -23,7 +23,7 @@ func.func @test_math(%arg0 : f32) {
         threads(%6, %7, %8) in (%9 = %c2, %10 = %c1, %11 = %c1) { 
         // CHECK-NVVM: __nv_expf 
         %s1 = math.exp %arg0 : f32
-        gpu.printf "%f" %s1 : f32
+        gpu.printf "%f", %s1 : f32
         gpu.terminator
     }
     return
diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
index aebfd7492093..88660ce598f3 100644
--- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir
+++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
@@ -750,6 +750,16 @@ llvm.func @experimental_noalias_scope_decl() {
   llvm.return
 }
 
+#alias_scope_domain2 = #llvm.alias_scope_domain<id = "domainid", description = "The domain">
+#alias_scope2 = #llvm.alias_scope<id = "stringid", domain = #alias_scope_domain2, description = "The domain">
+
+// CHECK-LABEL: @experimental_noalias_scope_with_string_id
+llvm.func @experimental_noalias_scope_with_string_id() {
+  // CHECK: llvm.intr.experimental.noalias.scope.decl #{{.*}}
+  llvm.intr.experimental.noalias.scope.decl #alias_scope2
+  llvm.return
+}
+
 // CHECK-LABEL: @experimental_constrained_fptrunc
 llvm.func @experimental_constrained_fptrunc(%in: f64) {
   // CHECK: llvm.intr.experimental.constrained.fptrunc %{{.*}} towardzero ignore : f64 to f32
diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir
index 67cd01f62f0b..60121bb0ea2f 100644
--- a/mlir/test/Dialect/Tosa/canonicalize.mlir
+++ b/mlir/test/Dialect/Tosa/canonicalize.mlir
@@ -162,7 +162,7 @@ func.func @conv2d_stride_2(%arg0: tensor<4x10x10x2xf32>) -> tensor<4x10x10x3xf32
   // CHECK: tosa.conv2d
   %weight = "tosa.const"() {value = dense<[[[[1.0, 1.0]]], [[[1.0, 1.0]]], [[[1.0, 1.0]]]]> : tensor<3x1x1x2xf32>} : ()-> tensor<3x1x1x2xf32>
   %bias = "tosa.const"() {value = dense<0.0> : tensor<3xf32>} : ()-> tensor<3xf32>
-  %0 = tosa.conv2d %arg0, %weight, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 2, 2>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<3x1x1x2xf32>, tensor<3xf32>) -> tensor<4x10x10x3xf32>
+  %0 = tosa.conv2d %arg0, %weight, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 2, 2>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<3x1x1x2xf32>, tensor<3xf32>) -> tensor<4x10x10x3xf32>
   return %0 : tensor<4x10x10x3xf32>
 }
 
@@ -173,7 +173,7 @@ func.func @conv2d_weight_2x2(%arg0: tensor<4x10x10x1xf32>) -> tensor<4x10x10x1xf
   // CHECK: tosa.conv2d
   %weight = "tosa.const"() {value = dense<[[[[1.0], [1.0]], [[1.0], [1.0]]]]> : tensor<1x2x2x1xf32>} : ()-> tensor<1x2x2x1xf32>
   %bias = "tosa.const"() {value = dense<0.0> : tensor<1xf32>} : ()-> tensor<1xf32>
-  %0 = tosa.conv2d %arg0, %weight, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x1xf32>, tensor<1x2x2x1xf32>, tensor<1xf32>) -> tensor<4x10x10x1xf32>
+  %0 = tosa.conv2d %arg0, %weight, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x1xf32>, tensor<1x2x2x1xf32>, tensor<1xf32>) -> tensor<4x10x10x1xf32>
   return %0 : tensor<4x10x10x1xf32>
 }
 
@@ -182,7 +182,7 @@ func.func @conv2d_weight_2x2(%arg0: tensor<4x10x10x1xf32>) -> tensor<4x10x10x1xf
 // CHECK-LABEL: @depthwise_conv2d_stride_2
 func.func @depthwise_conv2d_stride_2(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor<1x1x2x3xf32>, %arg2: tensor<6xf32>) -> tensor<4x10x10x6xf32> {
   // CHECK: tosa.depthwise_conv2d
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 2, 2>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<1x1x2x3xf32>, tensor<6xf32>) -> tensor<4x10x10x6xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 2, 2>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<1x1x2x3xf32>, tensor<6xf32>) -> tensor<4x10x10x6xf32>
   return %0 : tensor<4x10x10x6xf32>
 }
 
@@ -191,7 +191,7 @@ func.func @depthwise_conv2d_stride_2(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor
 // CHECK-LABEL: @depthwise_conv2d_weight_2x2
 func.func @depthwise_conv2d_weight_2x2(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor<2x2x2x3xf32>, %arg2: tensor<6xf32>) -> tensor<4x10x10x6xf32> {
   // CHECK: tosa.depthwise_conv2d
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<2x2x2x3xf32>, tensor<6xf32>) -> tensor<4x10x10x6xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<2x2x2x3xf32>, tensor<6xf32>) -> tensor<4x10x10x6xf32>
   return %0 : tensor<4x10x10x6xf32>
 }
 
@@ -210,8 +210,8 @@ func.func @max_pool2d_is_noop(%arg0: tensor<10x1x1x3xf32>) -> tensor<10x1x1x3xf3
 // CHECK-LABEL: @pad_noop
 func.func @pad_noop(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
   // CHECK: return %arg0
-  %0 = "tosa.const"() { value = dense<0> : tensor<2x2xi32>} : () -> tensor<2x2xi32>
-  %1 = tosa.pad %arg0, %0 : (tensor<?x?xf32>, tensor<2x2xi32>) -> tensor<?x?xf32>
+  %0 = "tosa.const"() { value = dense<0> : tensor<4xi32>} : () -> tensor<4xi32>
+  %1 = tosa.pad %arg0, %0 : (tensor<?x?xf32>, tensor<4xi32>) -> tensor<?x?xf32>
   return %1 : tensor<?x?xf32>
 }
 
@@ -221,8 +221,8 @@ func.func @pad_noop(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
 func.func @pad_noop_padding_mismatch_nofold(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
   // CHECK: %[[PAD:.+]] = tosa.pad
   // CHECK: return %[[PAD]]
-  %0 = "tosa.const"() { value = dense_resource<__elided__> : tensor<2x2xi32>} : () -> tensor<2x2xi32>
-  %1 = tosa.pad %arg0, %0 : (tensor<?x?xf32>, tensor<2x2xi32>) -> tensor<?x?xf32>
+  %0 = "tosa.const"() { value = dense_resource<__elided__> : tensor<4xi32>} : () -> tensor<4xi32>
+  %1 = tosa.pad %arg0, %0 : (tensor<?x?xf32>, tensor<4xi32>) -> tensor<?x?xf32>
   return %1 : tensor<?x?xf32>
 }
 
@@ -234,42 +234,39 @@ func.func @pad_noop_type_mismatch_nofold(%arg0: tensor<10xf32>) -> tensor<?xf32>
   // CHECK: return %[[PAD]]
 
   %c0_i32 = arith.constant 0 : i32
-  %shape = tensor.from_elements %c0_i32, %c0_i32 : tensor<1x2xi32>
+  %shape = tensor.from_elements %c0_i32, %c0_i32 : tensor<2xi32>
 
-  %0 = tosa.pad %arg0, %shape : (tensor<10xf32>, tensor<1x2xi32>) -> tensor<?xf32>
+  %0 = tosa.pad %arg0, %shape : (tensor<10xf32>, tensor<2xi32>) -> tensor<?xf32>
   return %0 : tensor<?xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @pad_determine_val_i32
-func.func @pad_determine_val_i32(%arg0: tensor<?x?xi32>, %arg1 : tensor<2x2xi32>) -> tensor<?x?xi32> {
+func.func @pad_determine_val_i32(%arg0: tensor<?x?xi32>, %arg1 : tensor<4xi32>) -> tensor<?x?xi32> {
   // CHECK: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0> : tensor<i32>}
   // CHECK: tosa.pad %arg0, %arg1, %[[ZERO]]
-  %0 = "tosa.const"() { value = dense<[[1, 0], [0, 1]]> : tensor<2x2xi32>} : () -> tensor<2x2xi32>
-  %1 = tosa.pad %arg0, %arg1 : (tensor<?x?xi32>, tensor<2x2xi32>) -> tensor<?x?xi32>
+  %1 = tosa.pad %arg0, %arg1 : (tensor<?x?xi32>, tensor<4xi32>) -> tensor<?x?xi32>
   return %1 : tensor<?x?xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @pad_determine_val_f32
-func.func @pad_determine_val_f32(%arg0: tensor<?x?xf32>, %arg1 : tensor<2x2xi32>) -> tensor<?x?xf32> {
+func.func @pad_determine_val_f32(%arg0: tensor<?x?xf32>, %arg1 : tensor<4xi32>) -> tensor<?x?xf32> {
   // CHECK: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<f32>}
   // CHECK: tosa.pad %arg0, %arg1, %[[ZERO]]
-  %0 = "tosa.const"() { value = dense<[[1, 0], [0, 1]]> : tensor<2x2xi32>} : () -> tensor<2x2xi32>
-  %1 = tosa.pad %arg0, %arg1 : (tensor<?x?xf32>, tensor<2x2xi32>) -> tensor<?x?xf32>
+  %1 = tosa.pad %arg0, %arg1 : (tensor<?x?xf32>, tensor<4xi32>) -> tensor<?x?xf32>
   return %1 : tensor<?x?xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @pad_determine_val_quant
-func.func @pad_determine_val_quant(%arg0: tensor<?x?xi32>, %arg1 : tensor<2x2xi32>) -> tensor<?x?xi32> {
+func.func @pad_determine_val_quant(%arg0: tensor<?x?xi32>, %arg1 : tensor<4xi32>) -> tensor<?x?xi32> {
   // CHECK: %[[ZERO:.+]] = "tosa.const"() <{value = dense<42> : tensor<i32>}
   // CHECK: tosa.pad %arg0, %arg1, %[[ZERO]]
-  %0 = "tosa.const"() { value = dense<[[1, 0], [0, 1]]> : tensor<2x2xi32>} : () -> tensor<2x2xi32>
-  %1 = tosa.pad %arg0, %arg1 {quantization_info = #tosa.pad_quant<input_zp = 42>} : (tensor<?x?xi32>, tensor<2x2xi32>) -> tensor<?x?xi32>
+  %1 = tosa.pad %arg0, %arg1 {quantization_info = #tosa.pad_quant<input_zp = 42>} : (tensor<?x?xi32>, tensor<4xi32>) -> tensor<?x?xi32>
   return %1 : tensor<?x?xi32>
 }
 
diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir
index b796a6343e5e..a6d57f8a2f61 100644
--- a/mlir/test/Dialect/Tosa/invalid.mlir
+++ b/mlir/test/Dialect/Tosa/invalid.mlir
@@ -25,7 +25,7 @@ func.func @test_const_non_tensor_attr() {
 
 func.func @test_conv2d(%arg0: tensor<1x29x29x4xf32>, %arg1: tensor<16x3x3x4xi8>, %arg2: tensor<16xi8>) -> tensor<1x27x27x16xi8> {
   // expected-error@+1 {{expect both input and weight to be float or not together, got 'f32' and 'i8'}}
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
            : (tensor<1x29x29x4xf32>, tensor<16x3x3x4xi8>, tensor<16xi8>) -> tensor<1x27x27x16xi8>
   return %0 : tensor<1x27x27x16xi8>
 }
@@ -34,7 +34,7 @@ func.func @test_conv2d(%arg0: tensor<1x29x29x4xf32>, %arg1: tensor<16x3x3x4xi8>,
 
 func.func @test_conv2d(%arg0: tensor<*xi8>, %arg1: tensor<16x3x3x4xi8>, %arg2: tensor<16xi8>) -> tensor<1x27x27x16xi8> {
   // expected-error@+1 {{expect a ranked tensor for input, got <block argument> of type 'tensor<*xi8>' at index: 0}}
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
            : (tensor<*xi8>, tensor<16x3x3x4xi8>, tensor<16xi8>) -> tensor<1x27x27x16xi8>
   return %0 : tensor<1x27x27x16xi8>
 }
@@ -43,7 +43,7 @@ func.func @test_conv2d(%arg0: tensor<*xi8>, %arg1: tensor<16x3x3x4xi8>, %arg2: t
 
 func.func @test_conv2d(%arg0: tensor<1x29x29x4xi8>, %arg1: tensor<*xi8>, %arg2: tensor<16xi8>) -> tensor<1x27x27x16xi8> {
   // expected-error@+1 {{'tosa.conv2d' op operand #1 must be 4D tensor of 4-bit signless integer or 8-bit signless integer or Quint8 type or Qint4 type or Qint8 type or Qint16 type or Qint32 type or floating-point values, but got 'tensor<*xi8>'}}
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
            : (tensor<1x29x29x4xi8>, tensor<*xi8>, tensor<16xi8>) -> tensor<1x27x27x16xi8>
   return %0 : tensor<1x27x27x16xi8>
 }
@@ -52,13 +52,101 @@ func.func @test_conv2d(%arg0: tensor<1x29x29x4xi8>, %arg1: tensor<*xi8>, %arg2:
 
 func.func @test_conv2d(%arg0: tensor<1x29x29x4xi8>, %arg1: tensor<16x3x3x4xi8>, %arg2: tensor<16xi8>) -> tensor<1x27x27x16xi8> {
   // expected-error@+1 {{'tosa.conv2d' op quantizationattr is required for quantized type, and not allowed for float type}}
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = f16, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
            : (tensor<1x29x29x4xi8>, tensor<16x3x3x4xi8>, tensor<16xi8>) -> tensor<1x27x27x16xi8>
   return %0 : tensor<1x27x27x16xi8>
 }
 
 // -----
 
+func.func @test_conv2d_acc_type(%arg0: tensor<1x29x29x4xi8>, %arg1: tensor<16x3x3x4xi8>, %arg2: tensor<16xi8>) -> tensor<1x27x27x16xi8> {
+  // expected-error@+1 {{'tosa.conv2d' op accumulator type for i8 tensor is not i32}}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = f16, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>}
+           : (tensor<1x29x29x4xi8>, tensor<16x3x3x4xi8>, tensor<16xi8>) -> tensor<1x27x27x16xi8>
+  return %0 : tensor<1x27x27x16xi8>
+}
+
+// -----
+
+func.func @test_conv2d_acc_type(%arg0: tensor<1x29x29x4xi16>, %arg1: tensor<16x3x3x4xi8>, %arg2: tensor<16xi16>) -> tensor<1x27x27x16xi16> {
+  // expected-error@+1 {{'tosa.conv2d' op accumulator type for i16 tensor is not i48}}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = f16, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>}
+           : (tensor<1x29x29x4xi16>, tensor<16x3x3x4xi8>, tensor<16xi16>) -> tensor<1x27x27x16xi16>
+  return %0 : tensor<1x27x27x16xi16>
+}
+
+// -----
+
+func.func @test_conv2d_acc_type(%arg0: tensor<1x29x29x4xf8E5M2>, %arg1: tensor<16x3x3x4xf8E5M2>, %arg2: tensor<16xf16>) -> tensor<1x27x27x16xf16> {
+  // expected-error@+1 {{'tosa.conv2d' op accumulator type for f8 tensor is not f16}}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+           : (tensor<1x29x29x4xf8E5M2>, tensor<16x3x3x4xf8E5M2>, tensor<16xf16>) -> tensor<1x27x27x16xf16>
+  return %0 : tensor<1x27x27x16xf16>
+}
+
+// -----
+
+func.func @test_conv2d_acc_type(%arg0: tensor<1x29x29x4xf8E4M3>, %arg1: tensor<16x3x3x4xf8E4M3>, %arg2: tensor<16xf16>) -> tensor<1x27x27x16xf16> {
+  // expected-error@+1 {{'tosa.conv2d' op accumulator type for f8 tensor is not f16}}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+           : (tensor<1x29x29x4xf8E4M3>, tensor<16x3x3x4xf8E4M3>, tensor<16xf16>) -> tensor<1x27x27x16xf16>
+  return %0 : tensor<1x27x27x16xf16>
+}
+
+// -----
+
+func.func @test_conv2d_acc_type(%arg0: tensor<1x29x29x4xf16>, %arg1: tensor<16x3x3x4xf16>, %arg2: tensor<16xf16>) -> tensor<1x27x27x16xf16> {
+  // expected-error@+1 {{'tosa.conv2d' op accumulator type for f16 tensor is not f16/f32}}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+           : (tensor<1x29x29x4xf16>, tensor<16x3x3x4xf16>, tensor<16xf16>) -> tensor<1x27x27x16xf16>
+  return %0 : tensor<1x27x27x16xf16>
+}
+
+// -----
+
+func.func @test_conv2d_acc_type(%arg0: tensor<1x29x29x4xbf16>, %arg1: tensor<16x3x3x4xbf16>, %arg2: tensor<16xbf16>) -> tensor<1x27x27x16xbf16> {
+  // expected-error@+1 {{'tosa.conv2d' op accumulator type for bf16 tensor is not f32}}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+           : (tensor<1x29x29x4xbf16>, tensor<16x3x3x4xbf16>, tensor<16xbf16>) -> tensor<1x27x27x16xbf16>
+  return %0 : tensor<1x27x27x16xbf16>
+}
+
+// -----
+
+func.func @test_conv2d_acc_type(%arg0: tensor<1x29x29x4xf32>, %arg1: tensor<16x3x3x4xf32>, %arg2: tensor<16xf32>) -> tensor<1x27x27x16xf32> {
+  // expected-error@+1 {{'tosa.conv2d' op accumulator type for f32 tensor is not f32}}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+           : (tensor<1x29x29x4xf32>, tensor<16x3x3x4xf32>, tensor<16xf32>) -> tensor<1x27x27x16xf32>
+  return %0 : tensor<1x27x27x16xf32>
+}
+
+// -----
+
+func.func @test_conv3d_acc_type(%arg0: tensor<1x4x8x21x17xi8>, %arg1: tensor<34x1x1x1x17xi8>, %arg2: tensor<34xi8>) -> tensor<1x4x8x21x34xi8> {
+  // expected-error@+1 {{'tosa.conv3d' op accumulator type for i8 tensor is not i32}}
+  %0 = tosa.conv3d %arg0, %arg1, %arg2 {acc_type = f16, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>}
+           : (tensor<1x4x8x21x17xi8>, tensor<34x1x1x1x17xi8>, tensor<34xi8>) -> tensor<1x4x8x21x34xi8>
+  return %0 : tensor<1x4x8x21x34xi8>
+}
+
+// -----
+
+func.func @test_depthwise_conv2d_acc_type(%arg0: tensor<1x4x4x4xi8>, %arg1: tensor<1x1x4x2xi8>, %arg2: tensor<8xi8>) -> tensor<1x4x4x8xi8> {
+  // expected-error@+1 {{'tosa.depthwise_conv2d' op accumulator type for i8 tensor is not i32}}
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f16, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>} : (tensor<1x4x4x4xi8>, tensor<1x1x4x2xi8>, tensor<8xi8>) -> tensor<1x4x4x8xi8>
+  return %0 : tensor<1x4x4x8xi8>
+}
+
+// -----
+
+func.func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xi8>, %arg1: tensor<16x1x1x8xi8>, %arg2: tensor<16xi8>) -> tensor<1x32x32x16xi8> {
+  // expected-error@+1 {{'tosa.transpose_conv2d' op accumulator type for i8 tensor is not i32}}
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f16, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>} : (tensor<1x32x32x8xi8>, tensor<16x1x1x8xi8>, tensor<16xi8>) -> tensor<1x32x32x16xi8>
+  return %0 : tensor<1x32x32x16xi8>
+}
+
+// -----
+
 func.func @test_concat(%arg0 : tensor<2x1xf32>, %arg1 : tensor<2x2xf32>) -> tensor<?x?xf32> {
   // expected-error@+2 {{failed to infer returned types}}
   // expected-error@+1 {{Cannot concat tensors with different sizes on the non-axis dimension 1}}
@@ -77,48 +165,56 @@ func.func @test_concat_element_type_mismatch(%arg0 : tensor<1x2xf32>, %arg1 : te
 
 // -----
 
-func.func @test_pad_non_const(%arg0: tensor<13x21x3xf32>, %arg1: tensor<3x2xi32>) -> tensor<13x21x3xf32> {
+func.func @test_pad_non_const(%arg0: tensor<13x21x3xf32>, %arg1: tensor<6xi32>) -> tensor<13x21x3xf32> {
   // expected-error@+1 {{'tosa.pad' op padding of pad is not constant}}
-  %0 = tosa.pad %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<3x2xi32>) -> tensor<13x21x3xf32>
+  %0 = tosa.pad %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<6xi32>) -> tensor<13x21x3xf32>
   return %0 : tensor<13x21x3xf32>
 }
 
 // -----
 
 func.func @test_pad_non_const(%arg0: tensor<13x21x3xi8>, %arg1: tensor<i8>) -> tensor<13x21x3xi8> {
-  %0 = "tosa.const"() {value = dense<[[0, 0], [0, 1], [0, 1]]> : tensor<3x2xi32>} : () -> tensor<3x2xi32>
+  %0 = "tosa.const"() {value = dense<[0, 0, 0, 1, 0, 1]> : tensor<6xi32>} : () -> tensor<6xi32>
   // expected-error@+1 {{'tosa.pad' op pad_const of pad is not constant}}
-  %1 = tosa.pad %arg0, %0, %arg1 : (tensor<13x21x3xi8>, tensor<3x2xi32>, tensor<i8>) -> tensor<13x21x3xi8>
+  %1 = tosa.pad %arg0, %0, %arg1 : (tensor<13x21x3xi8>, tensor<6xi32>, tensor<i8>) -> tensor<13x21x3xi8>
   return %1 : tensor<13x21x3xi8>
 }
 
 // -----
 
-func.func @test_pad_io_rank_mismatch(%arg0: tensor<13x21xf32>, %arg1: tensor<2x2xi32>) {
+func.func @test_pad_io_rank_mismatch(%arg0: tensor<13x21xf32>, %arg1: tensor<4xi32>) {
   // expected-error@+1 {{'tosa.pad' op expect same input and output tensor rank.}}
-  %1 = tosa.pad %arg0, %arg1 : (tensor<13x21xf32>, tensor<2x2xi32>) -> tensor<13x21x3xf32>
+  %1 = tosa.pad %arg0, %arg1 : (tensor<13x21xf32>, tensor<4xi32>) -> tensor<13x21x3xf32>
   return
 }
 
 // -----
 
-func.func @test_pad_invalid_padding_rank(%arg0: tensor<13x21xf32>, %arg1: tensor<2xi32>) {
-  // expected-error@+1 {{'tosa.pad' op expect 'padding' tensor rank equal to 2.}}
-  %1 = tosa.pad %arg0, %arg1 : (tensor<13x21xf32>, tensor<2xi32>) -> tensor<13x21xf32>
+func.func @test_pad_invalid_padding_rank(%arg0: tensor<13x21xf32>, %arg1: tensor<2x2xi32>) {
+  // expected-error@+1 {{'tosa.pad' op operand #1 must be 1D tensor of 32-bit signless integer or 64-bit signless integer values, but got 'tensor<2x2xi32>'}}
+  %1 = tosa.pad %arg0, %arg1 : (tensor<13x21xf32>, tensor<2x2xi32>) -> tensor<13x21xf32>
   return
 }
 
 // -----
 
-func.func @test_pad_invalid_padConst_rank(%arg0: tensor<13x21xf32>, %arg1: tensor<2x2xi32>) {
+func.func @test_pad_invalid_padConst_rank(%arg0: tensor<13x21xf32>, %arg1: tensor<4xi32>) {
   %0 = "tosa.const"() {value = dense<3.14> : tensor<1xf32>} : () -> tensor<1xf32>
   // expected-error@+1 {{'tosa.pad' op operand #2 must be 0D tensor of number values, but got 'tensor<1xf32>'}}
-  %1 = tosa.pad %arg0, %arg1, %0 : (tensor<13x21xf32>, tensor<2x2xi32>, tensor<1xf32>) -> tensor<13x21xf32>
+  %1 = tosa.pad %arg0, %arg1, %0 : (tensor<13x21xf32>, tensor<4xi32>, tensor<1xf32>) -> tensor<13x21xf32>
   return
 }
 
 // -----
 
+func.func @test_pad_padding_shape_mismatch(%arg0: tensor<13x21x3xf32>, %arg1: tensor<4xi32>) -> tensor<13x21x3xf32> {
+  // expected-error@+1 {{'tosa.pad' op expected padding tensor dim 0 to have size 6 (2*rank(shape1)) but got size 4}}
+  %0 = tosa.pad %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<4xi32>) -> tensor<13x21x3xf32>
+  return %0 : tensor<13x21x3xf32>
+}
+
+// -----
+
 func.func @test_transpose_non_const(%arg0: tensor<13x21x3xf32>, %arg1: tensor<3xi32>) -> tensor<3x13x21xf32> {
   // expected-error@+1 {{'tosa.transpose' op perms of transpose is not constant}}
   %0 = tosa.transpose %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<3xi32>) -> tensor<3x13x21xf32>
@@ -425,7 +521,7 @@ func.func @test_const_attribute_type_mismatch() -> tensor<100x100xf32> {
 
 func.func @test_conv2d_static_zero_dim_input(%arg0: tensor<1x29x0x4xf32>, %arg1: tensor<16x3x3x4xf32>, %arg2: tensor<16xf32>) -> tensor<1x27x27x16xf32> {
   // expected-error@+1 {{'tosa.conv2d' op operand #0 must be 4-d tosa-conformant tensor, but got 'tensor<1x29x0x4xf32>'}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
            : (tensor<1x29x0x4xf32>, tensor<16x3x3x4xf32>, tensor<16xf32>) -> tensor<1x27x27x16xf32>
   return %0 : tensor<1x27x27x16xf32>
 }
diff --git a/mlir/test/Dialect/Tosa/level_check.mlir b/mlir/test/Dialect/Tosa/level_check.mlir
index 529a16ca48c7..ba8ed8a1e5f5 100644
--- a/mlir/test/Dialect/Tosa/level_check.mlir
+++ b/mlir/test/Dialect/Tosa/level_check.mlir
@@ -226,7 +226,7 @@ func.func @test_avgpool2d_pad_right(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x32
 
 func.func @test_conv2d_dilation_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv2d' op failed level check: dilation_y * KH <= MAX_KERNEL}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 4097, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 4097, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -235,7 +235,7 @@ func.func @test_conv2d_dilation_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16
 
 func.func @test_conv2d_dilation_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv2d' op failed level check: dilation_x * KW <= MAX_KERNEL}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 4097>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 4097>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -244,7 +244,7 @@ func.func @test_conv2d_dilation_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16
 
 func.func @test_conv2d_pad_top(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 8193, 1, 0, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 8193, 1, 0, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -253,7 +253,7 @@ func.func @test_conv2d_pad_top(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x
 
 func.func @test_conv2d_pad_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 8193, 0, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 8193, 0, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -262,7 +262,7 @@ func.func @test_conv2d_pad_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16
 
 func.func @test_conv2d_pad_left(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 8193, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 8193, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -271,7 +271,7 @@ func.func @test_conv2d_pad_left(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2
 
 func.func @test_conv2d_pad_right(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 8193>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 8193>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -280,7 +280,7 @@ func.func @test_conv2d_pad_right(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x
 
 func.func @test_conv2d_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv2d' op failed level check: stride <= MAX_STRIDE}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 8193, 1>} :
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 8193, 1>} :
             (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -289,7 +289,7 @@ func.func @test_conv2d_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2
 
 func.func @test_conv2d_stride_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv2d' op failed level check: stride <= MAX_STRIDE}}
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 8193>} :
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 8193>} :
             (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -298,7 +298,7 @@ func.func @test_conv2d_stride_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2
 
 func.func @test_conv3d_dilation_d(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: dilation_d * KD <= MAX_KERNEL}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 4097, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 4097, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -307,7 +307,7 @@ func.func @test_conv3d_dilation_d(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<
 
 func.func @test_conv3d_dilation_y(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: dilation_y * KH <= MAX_KERNEL}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 4097, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 4097, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -316,7 +316,7 @@ func.func @test_conv3d_dilation_y(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<
 
 func.func @test_conv3d_dilation_x(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: dilation_x * KW <= MAX_KERNEL}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 4097>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 4097>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -325,7 +325,7 @@ func.func @test_conv3d_dilation_x(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<
 
 func.func @test_conv3d_pad_d0(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 1>, pad = array<i64: 8193, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 8193, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -334,7 +334,7 @@ func.func @test_conv3d_pad_d0(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2
 
 func.func @test_conv3d_pad_d1(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 1>, pad = array<i64: 1, 8193, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 1, 8193, 0, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -343,7 +343,7 @@ func.func @test_conv3d_pad_d1(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2
 
 func.func @test_conv3d_pad_top(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 8193, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 8193, 1, 0, 1>, stride = array<i64: 1, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -352,7 +352,7 @@ func.func @test_conv3d_pad_top(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x
 
 func.func @test_conv3d_pad_bottom(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 8193, 0, 1>, stride = array<i64: 1, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 8193, 0, 1>, stride = array<i64: 1, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -361,7 +361,7 @@ func.func @test_conv3d_pad_bottom(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<
 
 func.func @test_conv3d_pad_left(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 8193, 1>, stride = array<i64: 1, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 8193, 1>, stride = array<i64: 1, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -370,7 +370,7 @@ func.func @test_conv3d_pad_left(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16
 
 func.func @test_conv3d_pad_right(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 8193>, stride = array<i64: 1, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 8193>, stride = array<i64: 1, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -379,7 +379,7 @@ func.func @test_conv3d_pad_right(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<1
 
 func.func @test_conv3d_stride_d(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: stride <= MAX_STRIDE}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 8193, 1, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 8193, 1, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -388,7 +388,7 @@ func.func @test_conv3d_stride_d(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16
 
 func.func @test_conv3d_stride_y(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: stride <= MAX_STRIDE}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 8193, 1>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 8193, 1>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -397,7 +397,7 @@ func.func @test_conv3d_stride_y(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16
 
 func.func @test_conv3d_stride_x(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16x2x2x2x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.conv3d' op failed level check: stride <= MAX_STRIDE}}
-  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 8193>} :
+  %0 = "tosa.conv3d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 1, 0, 1, 0, 1>, stride = array<i64: 1, 1, 8193>} :
             (tensor<1x1x32x32x8xf32>, tensor<16x2x2x2x8xf32>, tensor<16xf32>) -> tensor<1x1x32x32x16xf32>
   return %0 : tensor<1x1x32x32x16xf32>
 }
@@ -406,7 +406,7 @@ func.func @test_conv3d_stride_x(%arg0: tensor<1x1x32x32x8xf32>, %arg1: tensor<16
 
 func.func @test_depthwise_conv2d_dilation_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>) -> tensor<1x32x32x64xf32> {
   // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: dilation_y * KH <= MAX_KERNEL}}
-  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 4097, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 4097, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>) -> tensor<1x32x32x64xf32>
   return %0 : tensor<1x32x32x64xf32>
 }
@@ -415,7 +415,7 @@ func.func @test_depthwise_conv2d_dilation_y(%arg0: tensor<1x32x32x8xf32>, %arg1:
 
 func.func @test_depthwise_conv2d_dilation_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>) -> tensor<1x32x32x64xf32> {
   // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: dilation_x * KW <= MAX_KERNEL}}
-  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 4097>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 4097>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>) -> tensor<1x32x32x64xf32>
   return %0 : tensor<1x32x32x64xf32>
 }
@@ -424,7 +424,7 @@ func.func @test_depthwise_conv2d_dilation_x(%arg0: tensor<1x32x32x8xf32>, %arg1:
 
 func.func @test_depthwise_conv2d_pad_top(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>) -> tensor<1x32x32x64xf32> {
   // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 8193, 1, 0, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 8193, 1, 0, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>) -> tensor<1x32x32x64xf32>
   return %0 : tensor<1x32x32x64xf32>
 }
@@ -433,7 +433,7 @@ func.func @test_depthwise_conv2d_pad_top(%arg0: tensor<1x32x32x8xf32>, %arg1: te
 
 func.func @test_depthwise_conv2d_pad_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>) -> tensor<1x32x32x64xf32> {
   // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 8193, 0, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 8193, 0, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>) -> tensor<1x32x32x64xf32>
   return %0 : tensor<1x32x32x64xf32>
 }
@@ -442,7 +442,7 @@ func.func @test_depthwise_conv2d_pad_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1:
 
 func.func @test_depthwise_conv2d_pad_left(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>) -> tensor<1x32x32x64xf32> {
   // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 8193, 1>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 8193, 1>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>) -> tensor<1x32x32x64xf32>
   return %0 : tensor<1x32x32x64xf32>
 }
@@ -451,7 +451,7 @@ func.func @test_depthwise_conv2d_pad_left(%arg0: tensor<1x32x32x8xf32>, %arg1: t
 
 func.func @test_depthwise_conv2d_pad_right(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>) -> tensor<1x32x32x64xf32> {
   // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 8193>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 8193>, stride = array<i64: 1, 1>} :
             (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>) -> tensor<1x32x32x64xf32>
   return %0 : tensor<1x32x32x64xf32>
 }
@@ -460,7 +460,7 @@ func.func @test_depthwise_conv2d_pad_right(%arg0: tensor<1x32x32x8xf32>, %arg1:
 
 func.func @test_depthwise_conv2d_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>) -> tensor<1x32x32x64xf32> {
   // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: stride <= MAX_STRIDE}}
-  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 8193, 1>} :
+  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 8193, 1>} :
             (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>) -> tensor<1x32x32x64xf32>
   return %0 : tensor<1x32x32x64xf32>
 }
@@ -469,7 +469,7 @@ func.func @test_depthwise_conv2d_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: t
 
 func.func @test_depthwise_conv2d_stride_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x8xf32>, %arg2: tensor<64xf32>) -> tensor<1x32x32x64xf32> {
   // expected-error@+1 {{'tosa.depthwise_conv2d' op failed level check: stride <= MAX_STRIDE}}
-  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 8193>} :
+  %0 = "tosa.depthwise_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 8193>} :
             (tensor<1x32x32x8xf32>, tensor<2x2x8x8xf32>, tensor<64xf32>) -> tensor<1x32x32x64xf32>
   return %0 : tensor<1x32x32x64xf32>
 }
@@ -603,7 +603,7 @@ func.func @test_rfft2d_input_w(%arg0: tensor<13x8x8193xf32>) -> (tensor<13x8x9xf
 
 func.func @test_transpose_conv2d_weight_h(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x8193x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: KH <= MAX_KERNEL}}
-  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
               (tensor<1x32x32x8xf32>, tensor<16x8193x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -612,7 +612,7 @@ func.func @test_transpose_conv2d_weight_h(%arg0: tensor<1x32x32x8xf32>, %arg1: t
 
 func.func @test_transpose_conv2d_weight_w(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x8193x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: KW <= MAX_KERNEL}}
-  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
               (tensor<1x32x32x8xf32>, tensor<16x1x8193x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -621,7 +621,7 @@ func.func @test_transpose_conv2d_weight_w(%arg0: tensor<1x32x32x8xf32>, %arg1: t
 
 func.func @test_transpose_conv2d_pad_top(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {out_pad = array<i64: 8193, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, out_pad = array<i64: 8193, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
               (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -630,7 +630,7 @@ func.func @test_transpose_conv2d_pad_top(%arg0: tensor<1x32x32x8xf32>, %arg1: te
 
 func.func @test_transpose_conv2d_pad_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {out_pad = array<i64: 0, 8193, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, out_pad = array<i64: 0, 8193, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
               (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -639,7 +639,7 @@ func.func @test_transpose_conv2d_pad_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1:
 
 func.func @test_transpose_conv2d_pad_left(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {out_pad = array<i64: 0, 0, 8193, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, out_pad = array<i64: 0, 0, 8193, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
               (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -648,7 +648,7 @@ func.func @test_transpose_conv2d_pad_left(%arg0: tensor<1x32x32x8xf32>, %arg1: t
 
 func.func @test_transpose_conv2d_pad_right(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: pad <= MAX_KERNEL}}
-  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {out_pad = array<i64: 0, 0, 0, 8193>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
+  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, out_pad = array<i64: 0, 0, 0, 8193>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} :
               (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -657,7 +657,7 @@ func.func @test_transpose_conv2d_pad_right(%arg0: tensor<1x32x32x8xf32>, %arg1:
 
 func.func @test_transpose_conv2d_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: stride <= MAX_STRIDE}}
-  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 8193, 1>} :
+  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 8193, 1>} :
               (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
@@ -666,7 +666,7 @@ func.func @test_transpose_conv2d_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: t
 
 func.func @test_transpose_conv2d_stride_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: stride <= MAX_STRIDE}}
-  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 8193>} :
+  %0 = "tosa.transpose_conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 8193>} :
               (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir
index 88fa94ae90db..f2e1cff72ab2 100644
--- a/mlir/test/Dialect/Tosa/ops.mlir
+++ b/mlir/test/Dialect/Tosa/ops.mlir
@@ -54,7 +54,7 @@ func.func @test_avg_pool2d_q8(%arg0: tensor<1x7x7x9x!quant.uniform<i8:f32, 0.01>
 // -----
 // CHECK-LABEL: conv2d
 func.func @test_conv2d(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<8x1x1x4xf32>, %arg2: tensor<8xf32>) -> tensor<1x4x4x8xf32> {
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, local_bound = true} : (tensor<1x4x4x4xf32>, tensor<8x1x1x4xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32>
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, local_bound = true} : (tensor<1x4x4x4xf32>, tensor<8x1x1x4xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32>
   return %0 : tensor<1x4x4x8xf32>
 }
 
@@ -63,7 +63,7 @@ func.func @test_conv2d(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<8x1x1x4xf32>, %
 func.func @test_conv2d_q8xi4(%arg0: tensor<1x11x11x3xi8>) -> tensor<1x1x1x3xi8> {
   %0 = "tosa.const"() {value = dense<0> : tensor<3x11x11x3xi4>} : () -> tensor<3x11x11x3xi4>
   %1 = "tosa.const"() {value = dense<[12, 23, 55]> : tensor<3xi32>} : () -> tensor<3xi32>
-  %2 = "tosa.conv2d"(%arg0, %0, %1) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>, stride = array<i64: 1, 1>} : (tensor<1x11x11x3xi8>, tensor<3x11x11x3xi4>, tensor<3xi32>) -> tensor<1x1x1x3xi32>
+  %2 = "tosa.conv2d"(%arg0, %0, %1) {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>, stride = array<i64: 1, 1>} : (tensor<1x11x11x3xi8>, tensor<3x11x11x3xi4>, tensor<3xi32>) -> tensor<1x1x1x3xi32>
   %3 = "tosa.rescale"(%2) {double_round = true, input_zp = 0 : i32, multiplier = array<i32: 2026291432, 1079222024, 1693132724>, output_zp = 27 : i32, per_channel = true, scale32 = true, shift = array<i8: 37, 36, 37>} : (tensor<1x1x1x3xi32>) -> tensor<1x1x1x3xi8>
   return %3 : tensor<1x1x1x3xi8>
 }
@@ -71,28 +71,28 @@ func.func @test_conv2d_q8xi4(%arg0: tensor<1x11x11x3xi8>) -> tensor<1x1x1x3xi8>
 // -----
 // CHECK-LABEL: conv3d
 func.func @test_conv3d(%arg0: tensor<1x4x8x21x17xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: tensor<34xf32>) -> tensor<1x4x8x21x34xf32> {
-  %0 = tosa.conv3d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<1x4x8x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<34xf32>) -> tensor<1x4x8x21x34xf32>
+  %0 = tosa.conv3d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<1x4x8x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<34xf32>) -> tensor<1x4x8x21x34xf32>
   return %0 : tensor<1x4x8x21x34xf32>
 }
 
 // -----
 // CHECK-LABEL: conv3d_with_local_bound
 func.func @test_conv3d_with_local_bound(%arg0: tensor<1x4x8x21x17xf32>, %arg1: tensor<34x1x1x1x17xf32>, %arg2: tensor<34xf32>) -> tensor<1x4x8x21x34xf32> {
-  %0 = tosa.conv3d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>, local_bound = true} : (tensor<1x4x8x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<34xf32>) -> tensor<1x4x8x21x34xf32>
+  %0 = tosa.conv3d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>, local_bound = true} : (tensor<1x4x8x21x17xf32>, tensor<34x1x1x1x17xf32>, tensor<34xf32>) -> tensor<1x4x8x21x34xf32>
   return %0 : tensor<1x4x8x21x34xf32>
 }
 
 // -----
 // CHECK-LABEL: depthwise_conv2d
 func.func @test_depthwise_conv2d(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<1x1x4x2xf32>, %arg2: tensor<8xf32>) -> tensor<1x4x4x8xf32> {
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<1x4x4x4xf32>, tensor<1x1x4x2xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<1x4x4x4xf32>, tensor<1x1x4x2xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32>
   return %0 : tensor<1x4x4x8xf32>
 }
 
 // -----
 // CHECK-LABEL: depthwise_conv2d_with_local_bound
 func.func @test_depthwise_conv2d_with_local_bound(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<1x1x4x2xf32>, %arg2: tensor<8xf32>) -> tensor<1x4x4x8xf32> {
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, local_bound = true} : (tensor<1x4x4x4xf32>, tensor<1x1x4x2xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, local_bound = true} : (tensor<1x4x4x4xf32>, tensor<1x1x4x2xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32>
   return %0 : tensor<1x4x4x8xf32>
 }
 
@@ -162,14 +162,14 @@ func.func @test_rfft2d_with_local_bound(%arg0: tensor<13x8x16xf32>) -> (tensor<1
 // -----
 // CHECK-LABEL: transpose_conv2d
 func.func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
 
 // -----
 // CHECK-LABEL: transpose_conv2d_with_local_bound
 func.func @test_transpose_conv2d_with_local_bound(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> {
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>, local_bound = false} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: 1, 32, 32, 16>, stride = array<i64: 1, 1>, local_bound = false} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
   return %0 : tensor<1x32x32x16xf32>
 }
 
@@ -525,16 +525,16 @@ func.func @test_concat(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -
 
 // -----
 // CHECK-LABEL: pad
-func.func @test_pad(%arg0: tensor<13x21x3xf32>, %arg1: tensor<3x2xi32>) -> tensor<13x21x3xf32> {
-  %0 = tosa.pad %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<3x2xi32>) -> tensor<13x21x3xf32>
+func.func @test_pad(%arg0: tensor<13x21x3xf32>, %arg1: tensor<6xi32>) -> tensor<13x21x3xf32> {
+  %0 = tosa.pad %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<6xi32>) -> tensor<13x21x3xf32>
   return %0 : tensor<13x21x3xf32>
 }
 
 // -----
 // CHECK-LABEL: pad_explicit_value
-func.func @test_pad_explicit_value(%arg0: tensor<13x21x3xf32>, %arg1: tensor<3x2xi32>) -> tensor<13x21x3xf32> {
+func.func @test_pad_explicit_value(%arg0: tensor<13x21x3xf32>, %arg1: tensor<6xi32>) -> tensor<13x21x3xf32> {
   %0 = "tosa.const"() {value = dense<3.14> : tensor<f32>} : () -> tensor<f32>
-  %1 = tosa.pad %arg0, %arg1, %0 : (tensor<13x21x3xf32>, tensor<3x2xi32>, tensor<f32>) -> tensor<13x21x3xf32>
+  %1 = tosa.pad %arg0, %arg1, %0 : (tensor<13x21x3xf32>, tensor<6xi32>, tensor<f32>) -> tensor<13x21x3xf32>
   return %1 : tensor<13x21x3xf32>
 }
 
diff --git a/mlir/test/Dialect/Tosa/quant-test.mlir b/mlir/test/Dialect/Tosa/quant-test.mlir
index 82a87dbc2494..6437f12e3ff8 100644
--- a/mlir/test/Dialect/Tosa/quant-test.mlir
+++ b/mlir/test/Dialect/Tosa/quant-test.mlir
@@ -10,9 +10,9 @@ func.func @test_build_qtype(%arg0 : tensor<16x1x1x8x!quant.uniform<u8<1:255>:f32
 
 // -----
 // CHECK-LABEL: test_build_mult_and_shift
-func.func @test_build_mult_and_shift(%arg0: tensor<1x32x32x8x!quant.uniform<i8:f32, 0.015684768557548523>>, %arg1 : tensor<16x1x1x8x!quant.uniform<i8<-127:127>:f32, 0.015680249780416489>>, %arg2 : tensor<16xi32>) -> tensor<1x32x32x16x!quant.uniform<i8:f32, 0.078431375324726104>> {
+func.func @test_build_mult_and_shift(%arg0: tensor<1x32x32x8x!quant.uniform<i8:f32, 0.015684768557548523>>, %arg1 : tensor<16x1x1x8x!quant.uniform<i8<-127:127>:f32, 0.015680249780416489>>, %arg2 : tensor<16xi32>) -> tensor<1x32x32x16x!quant.uniform<i32:f32, 0.078431375324726104>> {
   // CHECK: tosa.conv2d
-  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {pad = array<i64: 1, 1, 2, 2>, dilation = array<i64: 2, 1>, stride = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = -1, weight_zp = 0>} : (tensor<1x32x32x8x!quant.uniform<i8:f32, 0.015684768557548523>>, tensor<16x1x1x8x!quant.uniform<i8<-127:127>:f32, 0.015680249780416489>>, tensor<16xi32>) -> tensor<1x32x32x16x!quant.uniform<i8:f32, 0.078431375324726104>>
-  return %0 : tensor<1x32x32x16x!quant.uniform<i8:f32, 0.078431375324726104>>
+  %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = i32, pad = array<i64: 1, 1, 2, 2>, dilation = array<i64: 2, 1>, stride = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = -1, weight_zp = 0>} : (tensor<1x32x32x8x!quant.uniform<i8:f32, 0.015684768557548523>>, tensor<16x1x1x8x!quant.uniform<i8<-127:127>:f32, 0.015680249780416489>>, tensor<16xi32>) -> tensor<1x32x32x16x!quant.uniform<i32:f32, 0.078431375324726104>>
+  return %0 : tensor<1x32x32x16x!quant.uniform<i32:f32, 0.078431375324726104>>
 
 }
diff --git a/mlir/test/Dialect/Tosa/tosa-decompose-conv2d.mlir b/mlir/test/Dialect/Tosa/tosa-decompose-conv2d.mlir
index d876ccfb3b91..8df4630f9c17 100644
--- a/mlir/test/Dialect/Tosa/tosa-decompose-conv2d.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-decompose-conv2d.mlir
@@ -14,7 +14,7 @@ func.func @conv2d_as_fully_connected(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor
   // CHECK: %[[VAR3:.*]] = tosa.reshape %[[VAR2]] {new_shape = array<i64: 4, 10, 10, 3>}
   // CHECK-SAME: -> tensor<4x10x10x3xf32>
   // CHECK: return %[[VAR3]]
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<3x1x1x2xf32>, tensor<3xf32>) -> tensor<4x10x10x3xf32>
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<3x1x1x2xf32>, tensor<3xf32>) -> tensor<4x10x10x3xf32>
   return %0 : tensor<4x10x10x3xf32>
 }
 
@@ -33,7 +33,7 @@ func.func @conv2d_as_fully_connected_quant(%arg0: tensor<4x10x10x2xi8>, %arg1: t
   // CHECK: %[[VAR3:.*]] = tosa.reshape %[[VAR2]] {new_shape = array<i64: 4, 10, 10, 3>}
   // CHECK-SAME: -> tensor<4x10x10x3xi32>
   // CHECK: return %[[VAR3]]
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 42, weight_zp = 24>} : (tensor<4x10x10x2xi8>, tensor<3x1x1x2xi8>, tensor<3xi32>) -> tensor<4x10x10x3xi32>
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 42, weight_zp = 24>} : (tensor<4x10x10x2xi8>, tensor<3x1x1x2xi8>, tensor<3xi32>) -> tensor<4x10x10x3xi32>
   return %0 : tensor<4x10x10x3xi32>
 }
 
@@ -50,7 +50,7 @@ func.func @conv_with_dynamic_dim(%arg0: tensor<?x14x14x64xi8>, %arg1: tensor<384
 // CHECK:           %[[VAL_6:.*]] = tosa.reshape %[[VAL_5]] {new_shape = array<i64: -1, 14, 14, 384>} : (tensor<?x384xi32>) -> tensor<?x14x14x384xi32>
 // CHECK:           return %[[VAL_6]] : tensor<?x14x14x384xi32>
 // CHECK:         }
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -6, weight_zp = 11>, stride = array<i64: 1, 1>} : (tensor<?x14x14x64xi8>, tensor<384x1x1x64xi8>, tensor<384xi32>) -> tensor<?x14x14x384xi32>
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -6, weight_zp = 11>, stride = array<i64: 1, 1>} : (tensor<?x14x14x64xi8>, tensor<384x1x1x64xi8>, tensor<384xi32>) -> tensor<?x14x14x384xi32>
   return %0 : tensor<?x14x14x384xi32>
 }
 
@@ -58,13 +58,13 @@ func.func @conv_with_dynamic_dim(%arg0: tensor<?x14x14x64xi8>, %arg1: tensor<384
 
 // CHECK-LABEL: @conv2d_as_fully_connected_padded
 func.func @conv2d_as_fully_connected_padded(%arg0: tensor<4x10x10x2xi8>, %arg1: tensor<3x1x1x2xi8>, %arg2: tensor<3xi32>) -> tensor<4x12x12x3xi32> {
-  // CHECK-DAG: %[[PAD_SHAPE:.+]] = "tosa.const"() <{value = dense<{{\[\[}}0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64>}
+  // CHECK-DAG: %[[PAD_SHAPE:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 1, 1, 1, 1, 0, 0]> : tensor<8xi64>}
   // CHECK-DAG: %[[PAD_VAL:.+]] = "tosa.const"() <{value = dense<42> : tensor<i8>}
-  // CHECK-DAG: %[[PAD:.+]] = tosa.pad %arg0, %[[PAD_SHAPE]], %[[PAD_VAL]] : (tensor<4x10x10x2xi8>, tensor<4x2xi64>, tensor<i8>) -> tensor<4x12x12x2xi8>
+  // CHECK-DAG: %[[PAD:.+]] = tosa.pad %arg0, %[[PAD_SHAPE]], %[[PAD_VAL]] : (tensor<4x10x10x2xi8>, tensor<8xi64>, tensor<i8>) -> tensor<4x12x12x2xi8>
   // CHECK-DAG: %[[RESHAPE_INPUT:.+]] = tosa.reshape %[[PAD]] {new_shape = array<i64: 576, 2>}
   // CHECK-DAG: %[[RESHAPE_FILTER:.+]] = tosa.reshape %arg1 {new_shape = array<i64: 3, 2>}
   // CHECK-DAG: %[[FULLY:.+]] = tosa.fully_connected %[[RESHAPE_INPUT]], %[[RESHAPE_FILTER]], %arg2 {quantization_info = #tosa.conv_quant<input_zp = 42, weight_zp = 24>}
   // CHECK: %[[RESHAPE:.+]] = tosa.reshape %[[FULLY]] {new_shape = array<i64: 4, 12, 12, 3>}
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {pad = array<i64: 1, 1, 1, 1>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 42, weight_zp = 24>} : (tensor<4x10x10x2xi8>, tensor<3x1x1x2xi8>, tensor<3xi32>) -> tensor<4x12x12x3xi32>
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = i32, pad = array<i64: 1, 1, 1, 1>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 42, weight_zp = 24>} : (tensor<4x10x10x2xi8>, tensor<3x1x1x2xi8>, tensor<3xi32>) -> tensor<4x12x12x3xi32>
   return %0 : tensor<4x12x12x3xi32>
 }
diff --git a/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir b/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir
index 2224bf3f57b2..cfff6396ad48 100644
--- a/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir
@@ -18,7 +18,7 @@ func.func @depthwise_conv2d_as_mul(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor<1
   // CHECK: %[[VAR5:.*]] = tosa.add %[[VAR3]], %[[VAR4]]
   // CHECK-SAME: -> tensor<4x10x10x6xf32>
   // CHECK: return %[[VAR5]]
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<1x1x2x3xf32>, tensor<6xf32>) -> tensor<4x10x10x6xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<1x1x2x3xf32>, tensor<6xf32>) -> tensor<4x10x10x6xf32>
   return %0 : tensor<4x10x10x6xf32>
 }
 
@@ -38,7 +38,7 @@ func.func @depthwise_conv2d_as_mul_q(%arg0: tensor<4x10x10x2xi8>, %arg1: tensor<
   // CHECK: %[[reO:.+]] = tosa.reshape %[[mul]] {new_shape = array<i64: 4, 10, 10, 6>}
   // CHECK: %[[reArg2:.+]] = tosa.reshape %arg2 {new_shape = array<i64: 1, 1, 1, 6>}
   // CHECK: %[[add:.+]] = tosa.add %[[reO]], %[[reArg2]]
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 7, weight_zp = 11>} : (tensor<4x10x10x2xi8>, tensor<1x1x2x3xi8>, tensor<6xi32>) -> tensor<4x10x10x6xi32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = i32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>, quantization_info = #tosa.conv_quant<input_zp = 7, weight_zp = 11>} : (tensor<4x10x10x2xi8>, tensor<1x1x2x3xi8>, tensor<6xi32>) -> tensor<4x10x10x6xi32>
   return %0 : tensor<4x10x10x6xi32>
 }
 
@@ -46,15 +46,15 @@ func.func @depthwise_conv2d_as_mul_q(%arg0: tensor<4x10x10x2xi8>, %arg1: tensor<
 
 // CHECK-LABEL: @depthwise_conv2d_as_mul_padded
 func.func @depthwise_conv2d_as_mul_padded(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor<1x1x2x3xf32>, %arg2: tensor<6xf32>) -> tensor<4x12x12x6xf32> {
-  // CHECK-DAG: %[[pad:.+]] = "tosa.const"() <{value = dense<{{\[\[}}0, 0], [1, 1], [1, 1], [0, 0], [0, 0]]> : tensor<5x2xi64>}
+  // CHECK-DAG: %[[pad:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 1, 1, 1, 1, 0, 0, 0, 0]> : tensor<10xi64>}
   // CHECK-DAG: %[[zero:.+]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<f32>}
   // CHECK: %[[reIn:.+]] = tosa.reshape %arg0 {new_shape = array<i64: 4, 10, 10, 2, 1>}
-  // CHECK: %[[padded:.+]] = tosa.pad %[[reIn]], %[[pad]], %[[zero]] : (tensor<4x10x10x2x1xf32>, tensor<5x2xi64>, tensor<f32>) -> tensor<4x12x12x2x1xf32>
+  // CHECK: %[[padded:.+]] = tosa.pad %[[reIn]], %[[pad]], %[[zero]] : (tensor<4x10x10x2x1xf32>, tensor<10xi64>, tensor<f32>) -> tensor<4x12x12x2x1xf32>
   // CHECK: %[[reArg1:.+]] = tosa.reshape %arg1 {new_shape = array<i64: 1, 1, 1, 2, 3>}
   // CHECK: %[[mul:.+]] = tosa.mul %3, %[[reArg1]] {shift = 0 : i8}
   // CHECK: %[[reOut:.+]] = tosa.reshape %[[mul]] {new_shape = array<i64: 4, 12, 12, 6>}
   // CHECK: %[[reArg2:.+]] = tosa.reshape %arg2 {new_shape = array<i64: 1, 1, 1, 6>}
   // CHECK: %[[add:.+]] = tosa.add %[[reOut]], %[[reArg2]]
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 1, 1, 1, 1>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<1x1x2x3xf32>, tensor<6xf32>) -> tensor<4x12x12x6xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, pad = array<i64: 1, 1, 1, 1>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<4x10x10x2xf32>, tensor<1x1x2x3xf32>, tensor<6xf32>) -> tensor<4x12x12x6xf32>
   return %0 : tensor<4x12x12x6xf32>
 }
diff --git a/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir b/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir
index 1f2bb3fb9a36..c361c7c2899f 100644
--- a/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir
@@ -6,7 +6,7 @@ func.func @transpose_conv2d(%arg0: tensor<2x16x14x3xf32>, %arg1: tensor<5x3x6x3x
   // CHECK: %[[REV2:.+]] = tosa.reverse %[[REV1]] {axis = 2 : i32}
   // CHECK: tosa.conv2d %arg0, %[[REV2]], %arg2
   // CHECK-SAME: dilation = array<i64: 1, 1>, pad = array<i64: 2, 2, 5, 5>, stride = array<i64: 1, 1>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x16x14x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x18x19x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x16x14x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x18x19x5xf32>
   return %0 : tensor<2x18x19x5xf32>
 }
 
@@ -17,8 +17,8 @@ func.func @transpose_conv2d(%arg0: tensor<2x16x14x3xf32>, %arg1: tensor<5x3x6x3x
 func.func @transpose_conv2d_quantized(%arg0: tensor<2x16x14x3xi8>, %arg1: tensor<5x3x6x3xi8>, %arg2: tensor<5xi32>) -> (tensor<2x18x19x5xi32>) {
   // CHECK: %[[REV1:.+]] = tosa.reverse %arg1 {axis = 1 : i32}
   // CHECK: %[[REV2:.+]] = tosa.reverse %[[REV1]] {axis = 2 : i32}
-  // CHECK: tosa.conv2d %arg0, %[[REV2]], %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 2, 2, 5, 5>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, stride = array<i64: 1, 1>}
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x16x14x3xi8>, tensor<5x3x6x3xi8>, tensor<5xi32>) -> tensor<2x18x19x5xi32>
+  // CHECK: tosa.conv2d %arg0, %[[REV2]], %arg2 {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 2, 2, 5, 5>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, stride = array<i64: 1, 1>}
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = i32, out_pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x16x14x3xi8>, tensor<5x3x6x3xi8>, tensor<5xi32>) -> tensor<2x18x19x5xi32>
   return %0 : tensor<2x18x19x5xi32>
 }
 
@@ -32,6 +32,7 @@ func.func @transpose_conv2d_quantized_padded(%arg0: tensor<2x16x14x3xi8>, %arg1:
   // CHECK-SAME: dilation = array<i64: 1, 1>, pad = array<i64: 3, 4, 8, 9>,
   // CHECK-SAME: quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, stride = array<i64: 1, 1>}
   %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {
+    acc_type = i32,
     out_pad = array<i64: 1, 2, 3, 4>,
     quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>,
     out_shape = array<i64: -1, -1, -1, -1>,
@@ -44,7 +45,7 @@ func.func @transpose_conv2d_quantized_padded(%arg0: tensor<2x16x14x3xi8>, %arg1:
 // CHECK-LABEL: @transpose_conv2d_strided
 func.func @transpose_conv2d_strided(%arg0: tensor<2x17x15x3xf32>, %arg1: tensor<5x3x5x3xf32>, %arg2: tensor<5xf32>) -> tensor<2x?x?x5xf32> {
   // Manipulate the weight matrix to handle striding.
-  // CHECK-DAG: %[[PADV:.+]]  = "tosa.const"() <{value = dense<{{\[\[}}0, 0], [0, 1], [0, 1], [0, 0]]> : tensor<4x2xi32>}
+  // CHECK-DAG: %[[PADV:.+]]  = "tosa.const"() <{value = dense<{{\[}}0, 0, 0, 1, 0, 1, 0, 0]> : tensor<8xi32>}
   // CHECK-DAG: %[[TRANSV:.+]]  = "tosa.const"() <{value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi32>}
   // CHECK-DAG: %[[PADW:.+]]  = tosa.pad %arg1, %[[PADV]]
   // CHECK-DAG: %[[RESW1:.+]]  = tosa.reshape %[[PADW]] {new_shape = array<i64: 5, 2, 2, 2, 3, 3>}
@@ -54,20 +55,20 @@ func.func @transpose_conv2d_strided(%arg0: tensor<2x17x15x3xf32>, %arg1: tensor<
   // CHECK-DAG: %[[NEWWEIGHT:.+]] = tosa.reverse %[[REV1]] {axis = 2 : i32}
 
   // Pad out the input matrix to handle the transpose conv.
-  // CHECK-DAG: %[[PAD:.+]]  = "tosa.const"() <{value = dense<{{\[\[}}0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi32>}
+  // CHECK-DAG: %[[PAD:.+]]  = "tosa.const"() <{value = dense<{{\[}}0, 0, 1, 1, 1, 1, 0, 0]> : tensor<8xi32>}
   // CHECK-DAG: %[[TRANS2:.+]]  = "tosa.const"() <{value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>}
   // CHECK-DAG: %[[NEWINPUT:.+]] = tosa.pad %arg0, %[[PAD]]
 
   // Manipulate the final shape.
   // CHECK-DAG: %[[BIAS:.+]]  = "tosa.const"() <{value = dense<0.000000e+00> : tensor<30xf32>}
-  // CHECK-DAG: %[[CONV:.+]] = tosa.conv2d %[[NEWINPUT]], %[[NEWWEIGHT]], %[[BIAS]] {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+  // CHECK-DAG: %[[CONV:.+]] = tosa.conv2d %[[NEWINPUT]], %[[NEWWEIGHT]], %[[BIAS]] {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
   // CHECK-DAG: %[[RESHAPE_OUT_1:.+]] = tosa.reshape %[[CONV]] {new_shape = array<i64: 2, 18, 16, 2, 3, 5>}
   // CHECK-DAG: %[[TRANS_OUT:.+]] = tosa.transpose %[[RESHAPE_OUT_1]], %[[TRANS2]]
   // CHECK-DAG: %[[RESHAPE_OUT_2:.+]] = tosa.reshape %[[TRANS_OUT]] {new_shape = array<i64: 2, 36, 48, 5>}
   // CHECK-DAG: %[[SLICE:.+]] = tosa.slice %[[RESHAPE_OUT_2]] {size = array<i64: 2, 35, 47, 5>, start = array<i64: 0, 0, 0, 0>}
   // CHECK-DAG: %[[RESHAPE_ARG2:.+]] = tosa.reshape %arg2 {new_shape = array<i64: 1, 1, 1, 5>}
   // CHECK: %[[ADD:.+]] = tosa.add %[[SLICE]], %[[RESHAPE_ARG2]]
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 2, 3>} : (tensor<2x17x15x3xf32>, tensor<5x3x5x3xf32>, tensor<5xf32>) -> tensor<2x35x47x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 2, 3>} : (tensor<2x17x15x3xf32>, tensor<5x3x5x3xf32>, tensor<5xf32>) -> tensor<2x35x47x5xf32>
   %1 = tensor.cast %0 : tensor<2x35x47x5xf32> to tensor<2x?x?x5xf32>
   return %1 : tensor<2x?x?x5xf32>
 }
@@ -77,7 +78,7 @@ func.func @transpose_conv2d_strided(%arg0: tensor<2x17x15x3xf32>, %arg1: tensor<
 // CHECK-LABEL: @transpose_conv2d_strided_quantized
 func.func @transpose_conv2d_strided_quantized(%arg0: tensor<2x17x15x3xi8>, %arg1: tensor<5x3x5x3xi8>, %arg2: tensor<5xi32>) -> (tensor<2x35x47x5xi32>) {
   // Manipulate the weight matrix to handle striding.
-  // CHECK-DAG: %[[PADV:.+]]  = "tosa.const"() <{value = dense<{{\[\[}}0, 0], [0, 1], [0, 1], [0, 0]]> : tensor<4x2xi32>}
+  // CHECK-DAG: %[[PADV:.+]]  = "tosa.const"() <{value = dense<{{\[}}0, 0, 0, 1, 0, 1, 0, 0]> : tensor<8xi32>}
   // CHECK-DAG: %[[TRANSV:.+]]  = "tosa.const"() <{value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi32>}
   // CHECK-DAG: %[[PADW:.+]]  = tosa.pad %arg1, %[[PADV]] {quantization_info = #tosa.pad_quant<input_zp = 42>}
   // CHECK-DAG: %[[RESW1:.+]]  = tosa.reshape %[[PADW]] {new_shape = array<i64: 5, 2, 2, 2, 3, 3>}
@@ -87,20 +88,20 @@ func.func @transpose_conv2d_strided_quantized(%arg0: tensor<2x17x15x3xi8>, %arg1
   // CHECK-DAG: %[[NEWWEIGHT:.+]] = tosa.reverse %[[REV1]] {axis = 2 : i32}
 
   // Pad out the input matrix to handle the transpose conv.
-  // CHECK-DAG: %[[PAD:.+]]  = "tosa.const"() <{value = dense<{{\[\[}}0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi32>}
+  // CHECK-DAG: %[[PAD:.+]]  = "tosa.const"() <{value = dense<{{\[}}0, 0, 1, 1, 1, 1, 0, 0]> : tensor<8xi32>}
   // CHECK-DAG: %[[TRANS2:.+]]  = "tosa.const"() <{value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>}
   // CHECK-DAG: %[[NEWINPUT:.+]] = tosa.pad %arg0, %[[PAD]] {quantization_info = #tosa.pad_quant<input_zp = -22>}
 
   // Manipulate the final shape.
   // CHECK-DAG: %[[BIAS:.+]]  = "tosa.const"() <{value = dense<0> : tensor<30xi32>}
-  // CHECK-DAG: %[[CONV:.+]] = tosa.conv2d %[[NEWINPUT]], %[[NEWWEIGHT]], %[[BIAS]] {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, stride = array<i64: 1, 1>}
+  // CHECK-DAG: %[[CONV:.+]] = tosa.conv2d %[[NEWINPUT]], %[[NEWWEIGHT]], %[[BIAS]] {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, stride = array<i64: 1, 1>}
   // CHECK-DAG: %[[RESHAPE_OUT_1:.+]] = tosa.reshape %[[CONV]] {new_shape = array<i64: 2, 18, 16, 2, 3, 5>}
   // CHECK-DAG: %[[TRANS_OUT:.+]] = tosa.transpose %[[RESHAPE_OUT_1]], %[[TRANS2]]
   // CHECK-DAG: %[[RESHAPE_OUT_2:.+]] = tosa.reshape %[[TRANS_OUT]] {new_shape = array<i64: 2, 36, 48, 5>}
   // CHECK-DAG: %[[SLICE:.+]] = tosa.slice %[[RESHAPE_OUT_2]] {size = array<i64: 2, 35, 47, 5>, start = array<i64: 0, 0, 0, 0>}
   // CHECK-DAG: %[[RESHAPE_ARG2:.+]] = tosa.reshape %arg2 {new_shape = array<i64: 1, 1, 1, 5>}
   // CHECK: %[[ADD:.+]] = tosa.add %[[SLICE]], %[[RESHAPE_ARG2]]
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 2, 3>} : (tensor<2x17x15x3xi8>, tensor<5x3x5x3xi8>, tensor<5xi32>) -> tensor<2x35x47x5xi32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = i32, out_pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 2, 3>} : (tensor<2x17x15x3xi8>, tensor<5x3x5x3xi8>, tensor<5xi32>) -> tensor<2x35x47x5xi32>
   return %0 : tensor<2x35x47x5xi32>
 }
 
@@ -108,12 +109,12 @@ func.func @transpose_conv2d_strided_quantized(%arg0: tensor<2x17x15x3xi8>, %arg1
 
 // CHECK-LABEL: @transpose_conv2d_strided_overpad
 func.func @transpose_conv2d_strided_overpad(%arg0 : tensor<1x16x1x1xi8>, %arg1 : tensor<1x2x1x1xi8>, %arg2 : tensor<1xi32>) -> (tensor<1x19x2x1xi32>) {
-  // CHECK-DAG: %[[WEIGHT_PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}[0, 0], [0, 0], [0, 1], [0, 0]]> : tensor<4x2xi32>
+  // CHECK-DAG: %[[WEIGHT_PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 0, 0, 0, 1, 0, 0]> : tensor<8xi32>
   // CHECK-DAG: %[[WEIGHT_PERMS:.+]] = "tosa.const"() <{value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi32>}
-  // CHECK-DAG: %[[INPUT_PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}[0, 0], [1, 1], [0, 0], [0, 0]]> : tensor<4x2xi32>}
+  // CHECK-DAG: %[[INPUT_PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 1, 1, 0, 0, 0, 0]> : tensor<8xi32>}
   // CHECK-DAG: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0> : tensor<2xi32>}
   // CHECK-DAG: %[[RESULT_PERMS:.+]] = "tosa.const"() <{value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>}
-  // CHECK-DAG: %[[RESULT_PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}[0, 0], [2, 0], [0, 0], [0, 0]]> : tensor<4x2xi32>}
+  // CHECK-DAG: %[[RESULT_PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}0, 0, 2, 0, 0, 0, 0, 0]> : tensor<8xi32>}
   // CHECK: %[[PAD_WEIGHT:.+]] = tosa.pad %arg1, %[[WEIGHT_PAD]] {quantization_info = #tosa.pad_quant<input_zp = 93>}
   // CHECK: %[[RESHAPE_WEIGHT_0:.+]] = tosa.reshape %[[PAD_WEIGHT]] {new_shape = array<i64: 1, 2, 1, 1, 2, 1>}
   // CHECK: %[[TRANSPOSE_WEIGHT:.+]] = tosa.transpose %[[RESHAPE_WEIGHT_0]], %[[WEIGHT_PERMS]]
@@ -129,6 +130,7 @@ func.func @transpose_conv2d_strided_overpad(%arg0 : tensor<1x16x1x1xi8>, %arg1 :
   // CHECK: %[[RESHAPE_ARG2:.+]] = tosa.reshape %arg2 {new_shape = array<i64: 1, 1, 1, 1>}
   // CHECK: %[[ADD:.+]] = tosa.add %[[PAD_RESULT]], %[[RESHAPE_ARG2]]
   %2 =  tosa.transpose_conv2d %arg0, %arg1, %arg2 {
+    acc_type = i32,
     out_pad = array<i64: 2, 0, 0, 1>,
     out_shape = array<i64: 1, -1, -1, 1>,
     stride = array<i64: 1, 2>,
diff --git a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
index d46de740800e..82f3e22a3872 100644
--- a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
@@ -495,9 +495,9 @@ func.func @test_concat_axis_1(%arg0 : tensor<2x1xf32>, %arg1 : tensor<2x2xf32>)
 // -----
 
 // CHECK-LABEL: @test_padding_no_const
-func.func @test_padding_no_const(%arg0 : tensor<1x2xf32>, %arg1 : tensor<2x2xi32>) -> () {
-  // CHECK: tosa.pad %arg0, %arg1 : (tensor<1x2xf32>, tensor<2x2xi32>) -> tensor<?x?xf32>
-  %0 = tosa.pad %arg0, %arg1  : (tensor<1x2xf32>, tensor<2x2xi32>) -> tensor<?x?xf32>
+func.func @test_padding_no_const(%arg0 : tensor<1x2xf32>, %arg1 : tensor<4xi32>) -> () {
+  // CHECK: tosa.pad %arg0, %arg1 : (tensor<1x2xf32>, tensor<4xi32>) -> tensor<?x?xf32>
+  %0 = tosa.pad %arg0, %arg1  : (tensor<1x2xf32>, tensor<4xi32>) -> tensor<?x?xf32>
   return
 }
 
@@ -505,9 +505,9 @@ func.func @test_padding_no_const(%arg0 : tensor<1x2xf32>, %arg1 : tensor<2x2xi32
 
 // CHECK-LABEL:@test_padding_dynamic_input
 func.func @test_padding_dynamic_input(%arg0 : tensor<1x?xf32>) -> () {
-  %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-  // CHECK: tosa.pad %arg0, %cst : (tensor<1x?xf32>, tensor<2x2xi32>) -> tensor<4x?xf32>
-  %1 = tosa.pad %arg0, %0  : (tensor<1x?xf32>, tensor<2x2xi32>) -> tensor<?x?xf32>
+  %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
+  // CHECK: tosa.pad %arg0, %cst : (tensor<1x?xf32>, tensor<4xi32>) -> tensor<4x?xf32>
+  %1 = tosa.pad %arg0, %0  : (tensor<1x?xf32>, tensor<4xi32>) -> tensor<?x?xf32>
   return
 }
 
@@ -515,9 +515,9 @@ func.func @test_padding_dynamic_input(%arg0 : tensor<1x?xf32>) -> () {
 
 // CHECK-LABEL: @test_padding_simple
 func.func @test_padding_simple(%arg0 : tensor<1x2xf32>) -> () {
-  %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32>
-  // CHECK: tosa.pad %arg0, %cst : (tensor<1x2xf32>, tensor<2x2xi32>) -> tensor<4x9xf32>
-  %1 = tosa.pad %arg0, %0  : (tensor<1x2xf32>, tensor<2x2xi32>) -> tensor<?x?xf32>
+  %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
+  // CHECK: tosa.pad %arg0, %cst : (tensor<1x2xf32>, tensor<4xi32>) -> tensor<4x9xf32>
+  %1 = tosa.pad %arg0, %0  : (tensor<1x2xf32>, tensor<4xi32>) -> tensor<?x?xf32>
   return
 }
 
@@ -674,7 +674,7 @@ func.func @test_pool_static(%arg0: tensor<3x5x6x7xf32>) {
 // CHECK-LABEL: @conv2d_static
 func.func @conv2d_static(%input: tensor<2x8x9x3xf32>, %weights: tensor<5x3x6x3xf32>, %bias: tensor<5xf32>) -> () {
   // CHECK: -> tensor<2x6x4x5xf32>
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
   return
 }
 
@@ -683,7 +683,7 @@ func.func @conv2d_static(%input: tensor<2x8x9x3xf32>, %weights: tensor<5x3x6x3xf
 // CHECK-LABEL: @conv2d_dynamic_input
 func.func @conv2d_dynamic_input(%input: tensor<?x?x?x?xf32>, %weights: tensor<5x3x6x3xf32>, %bias: tensor<5xf32>) -> () {
   // CHECK: -> tensor<?x?x?x5xf32>
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<?x?x?x?xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<?x?x?x?xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
   return
 }
 
@@ -716,7 +716,7 @@ func.func @test_pool_padded(%arg0: tensor<3x5x6x7xf32>) {
 // CHECK-LABEL: @conv2d_dynamic_weight
 func.func @conv2d_dynamic_weight(%input: tensor<2x8x9x3xf32>, %weights: tensor<?x?x?x?xf32>, %bias: tensor<5xf32>) -> () {
   // CHECK: -> tensor<2x?x?x5xf32>
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<?x?x?x?xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<?x?x?x?xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
   return
 }
 
@@ -725,7 +725,7 @@ func.func @conv2d_dynamic_weight(%input: tensor<2x8x9x3xf32>, %weights: tensor<?
 // CHECK-LABEL: @conv2d_dynamic_bias
 func.func @conv2d_dynamic_bias(%input: tensor<2x8x9x3xf32>, %weights: tensor<5x3x6x3xf32>, %bias: tensor<?xf32>) -> () {
   // CHECK: -> tensor<2x6x4x5xf32>
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<5x3x6x3xf32>, tensor<?xf32>) -> tensor<?x?x?x?xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<5x3x6x3xf32>, tensor<?xf32>) -> tensor<?x?x?x?xf32>
   return
 }
 
@@ -746,7 +746,7 @@ func.func @test_pool_stride(%arg0: tensor<3x11x12x7xf32>) {
 // CHECK-LABEL: @conv2d_padded
 func.func @conv2d_padded(%input: tensor<2x8x9x3xf32>, %weights: tensor<5x3x6x3xf32>, %bias: tensor<5xf32>) -> () {
   // CHECK: -> tensor<2x9x11x5xf32>
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 1, 2, 3, 4>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 1, 2, 3, 4>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
   return
 }
 
@@ -755,7 +755,7 @@ func.func @conv2d_padded(%input: tensor<2x8x9x3xf32>, %weights: tensor<5x3x6x3xf
 // CHECK-LABEL: @conv2d_dilated
 func.func @conv2d_dilated(%input: tensor<2x12x14x3xf32>, %weights: tensor<5x3x6x3xf32>, %bias: tensor<5xf32>) -> () {
   // CHECK: -> tensor<2x6x4x5xf32>
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 3, 2>} : (tensor<2x12x14x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 3, 2>} : (tensor<2x12x14x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?xf32>
   return
 }
 
@@ -764,7 +764,7 @@ func.func @conv2d_dilated(%input: tensor<2x12x14x3xf32>, %weights: tensor<5x3x6x
 // CHECK-LABEL: @conv2d_strided
 func.func @conv2d_strided(%input: tensor<1x13x14x1xf32>, %weights: tensor<1x1x1x1xf32>, %bias: tensor<1xf32>) -> () {
   // CHECK: -> tensor<1x5x7x1xf32>
-  %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 3, 2>, dilation = array<i64: 1, 1>} : (tensor<1x13x14x1xf32>, tensor<1x1x1x1xf32>, tensor<1xf32>) -> tensor<?x?x?x?xf32>
+  %0 = tosa.conv2d %input, %weights, %bias {acc_type = f32, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 3, 2>, dilation = array<i64: 1, 1>} : (tensor<1x13x14x1xf32>, tensor<1x1x1x1xf32>, tensor<1xf32>) -> tensor<?x?x?x?xf32>
   return
 }
 
@@ -773,7 +773,7 @@ func.func @conv2d_strided(%input: tensor<1x13x14x1xf32>, %weights: tensor<1x1x1x
 // CHECK-LABEL: @conv3d_static
 func.func @conv3d_static(%input: tensor<2x8x9x10x3xf32>, %weights: tensor<5x3x6x4x3xf32>, %bias: tensor<5xf32>) -> () {
   // CHECK: -> tensor<2x6x4x7x5xf32>
-  %0 = tosa.conv3d %input, %weights, %bias {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<2x8x9x10x3xf32>, tensor<5x3x6x4x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
+  %0 = tosa.conv3d %input, %weights, %bias {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<2x8x9x10x3xf32>, tensor<5x3x6x4x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
   return
 }
 
@@ -782,7 +782,7 @@ func.func @conv3d_static(%input: tensor<2x8x9x10x3xf32>, %weights: tensor<5x3x6x
 // CHECK-LABEL: @conv3d_dynamic_input
 func.func @conv3d_dynamic_input(%arg0: tensor<?x?x?x?x?xf32>, %arg1: tensor<5x3x6x4x3xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<?x?x?x?x5xf32>
-  %0 = tosa.conv3d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<?x?x?x?x?xf32>, tensor<5x3x6x4x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
+  %0 = tosa.conv3d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<?x?x?x?x?xf32>, tensor<5x3x6x4x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
   return
 }
 
@@ -791,7 +791,7 @@ func.func @conv3d_dynamic_input(%arg0: tensor<?x?x?x?x?xf32>, %arg1: tensor<5x3x
 // CHECK-LABEL: @conv3d_dynamic_weight
 func.func @conv3d_dynamic_weight(%arg0: tensor<2x8x9x10x3xf32>, %arg1: tensor<?x?x?x?x?xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<2x?x?x?x5xf32>
-  %0 = tosa.conv3d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<2x8x9x10x3xf32>, tensor<?x?x?x?x?xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
+  %0 = tosa.conv3d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<2x8x9x10x3xf32>, tensor<?x?x?x?x?xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
   return
 }
 
@@ -800,7 +800,7 @@ func.func @conv3d_dynamic_weight(%arg0: tensor<2x8x9x10x3xf32>, %arg1: tensor<?x
 // CHECK-LABEL: @conv3d_dynamic_bias
 func.func @conv3d_dynamic_bias(%arg0: tensor<2x8x9x10x3xf32>, %arg1: tensor<5x3x6x4x3xf32>, %arg2: tensor<?xf32>) {
   // CHECK: -> tensor<2x6x4x7x5xf32>
-  %0 = tosa.conv3d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<2x8x9x10x3xf32>, tensor<5x3x6x4x3xf32>, tensor<?xf32>) -> tensor<?x?x?x?x?xf32>
+  %0 = tosa.conv3d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<2x8x9x10x3xf32>, tensor<5x3x6x4x3xf32>, tensor<?xf32>) -> tensor<?x?x?x?x?xf32>
   return
 }
 
@@ -809,7 +809,7 @@ func.func @conv3d_dynamic_bias(%arg0: tensor<2x8x9x10x3xf32>, %arg1: tensor<5x3x
 // CHECK-LABEL: @conv3d_padded
 func.func @conv3d_padded(%arg0: tensor<2x8x9x10x3xf32>, %arg1: tensor<5x3x6x4x3xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<2x9x11x18x5xf32>
-  %0 = tosa.conv3d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1, 1>, pad = array<i64: 1, 2, 3, 4, 5, 6>, stride = array<i64: 1, 1, 1>} : (tensor<2x8x9x10x3xf32>, tensor<5x3x6x4x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
+  %0 = tosa.conv3d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 1, 2, 3, 4, 5, 6>, stride = array<i64: 1, 1, 1>} : (tensor<2x8x9x10x3xf32>, tensor<5x3x6x4x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
   return
 }
 
@@ -818,7 +818,7 @@ func.func @conv3d_padded(%arg0: tensor<2x8x9x10x3xf32>, %arg1: tensor<5x3x6x4x3x
 // CHECK-LABEL: @conv3d_dilated
 func.func @conv3d_dilated(%arg0: tensor<2x12x14x16x3xf32>, %arg1: tensor<5x3x6x2x3xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<2x6x4x12x5xf32>
-  %0 = tosa.conv3d %arg0, %arg1, %arg2 {dilation = array<i64: 3, 2, 4>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<2x12x14x16x3xf32>, tensor<5x3x6x2x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
+  %0 = tosa.conv3d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 3, 2, 4>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>} : (tensor<2x12x14x16x3xf32>, tensor<5x3x6x2x3xf32>, tensor<5xf32>) -> tensor<?x?x?x?x?xf32>
   return
 }
 
@@ -827,7 +827,7 @@ func.func @conv3d_dilated(%arg0: tensor<2x12x14x16x3xf32>, %arg1: tensor<5x3x6x2
 // CHECK-LABEL: @conv3d_strided
 func.func @conv3d_strided(%arg0: tensor<1x13x14x15x1xf32>, %arg1: tensor<1x1x1x1x1xf32>, %arg2: tensor<1xf32>) {
   // CHECK: -> tensor<1x5x7x4x1xf32>
-  %0 = tosa.conv3d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 3, 2, 4>} : (tensor<1x13x14x15x1xf32>, tensor<1x1x1x1x1xf32>, tensor<1xf32>) -> tensor<?x?x?x?x?xf32>
+  %0 = tosa.conv3d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1, 1>, pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 3, 2, 4>} : (tensor<1x13x14x15x1xf32>, tensor<1x1x1x1x1xf32>, tensor<1xf32>) -> tensor<?x?x?x?x?xf32>
   return
 }
 
@@ -836,7 +836,7 @@ func.func @conv3d_strided(%arg0: tensor<1x13x14x15x1xf32>, %arg1: tensor<1x1x1x1
 // CHECK-LABEL: @depthwise_conv2d_static
 func.func @depthwise_conv2d_static(%arg0: tensor<2x8x9x3xf32>, %arg1: tensor<3x6x3x5xf32>, %arg2: tensor<15xf32>) {
   // CHECK: -> tensor<2x6x4x15xf32>
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<2x6x4x15xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<2x6x4x15xf32>
   return
 }
 
@@ -845,7 +845,7 @@ func.func @depthwise_conv2d_static(%arg0: tensor<2x8x9x3xf32>, %arg1: tensor<3x6
 // CHECK-LABEL: @depthwise_conv2d_dynamic_input
 func.func @depthwise_conv2d_dynamic_input(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<3x6x3x5xf32>, %arg2: tensor<15xf32>) {
   // CHECK: -> tensor<?x?x?x15xf32>
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<?x?x?x?xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<?x?x?x15xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<?x?x?x?xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<?x?x?x15xf32>
   return
 }
 
@@ -854,7 +854,7 @@ func.func @depthwise_conv2d_dynamic_input(%arg0: tensor<?x?x?x?xf32>, %arg1: ten
 // CHECK-LABEL: @depthwise_conv2d_dynamic_weight
 func.func @depthwise_conv2d_dynamic_weight(%arg0: tensor<2x8x9x3xf32>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<15xf32>) {
   // CHECK: -> tensor<2x?x?x15xf32>
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<?x?x?x?xf32>, tensor<15xf32>) -> tensor<2x?x?x15xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<?x?x?x?xf32>, tensor<15xf32>) -> tensor<2x?x?x15xf32>
   return
 }
 
@@ -863,7 +863,7 @@ func.func @depthwise_conv2d_dynamic_weight(%arg0: tensor<2x8x9x3xf32>, %arg1: te
 // CHECK-LABEL: @depthwise_conv2d_dynamic_bias
 func.func @depthwise_conv2d_dynamic_bias(%arg0: tensor<2x8x9x3xf32>, %arg1: tensor<3x6x3x5xf32>, %arg2: tensor<?xf32>) {
   // CHECK: -> tensor<2x6x4x15xf32>
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<3x6x3x5xf32>, tensor<?xf32>) -> tensor<2x6x4x15xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<3x6x3x5xf32>, tensor<?xf32>) -> tensor<2x6x4x15xf32>
   return
 }
 
@@ -872,7 +872,7 @@ func.func @depthwise_conv2d_dynamic_bias(%arg0: tensor<2x8x9x3xf32>, %arg1: tens
 // CHECK-LABEL: @depthwise_conv2d_padded
 func.func @depthwise_conv2d_padded(%arg0: tensor<2x8x9x3xf32>, %arg1: tensor<3x6x3x5xf32>, %arg2: tensor<15xf32>) {
   // CHECK: -> tensor<2x9x11x15xf32>
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 1, 2, 3, 4>, stride = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<2x9x11x15xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 1, 2, 3, 4>, stride = array<i64: 1, 1>} : (tensor<2x8x9x3xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<2x9x11x15xf32>
   return
 }
 
@@ -881,7 +881,7 @@ func.func @depthwise_conv2d_padded(%arg0: tensor<2x8x9x3xf32>, %arg1: tensor<3x6
 // CHECK-LABEL: @depthwise_conv2d_dilated
 func.func @depthwise_conv2d_dilated(%arg0: tensor<2x12x14x3xf32>, %arg1: tensor<3x6x3x5xf32>, %arg2: tensor<15xf32>) {
   // CHECK: -> tensor<2x6x4x15xf32>
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 3, 2>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x12x14x3xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<2x6x4x15xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 3, 2>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x12x14x3xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<2x6x4x15xf32>
   return
 }
 
@@ -890,7 +890,7 @@ func.func @depthwise_conv2d_dilated(%arg0: tensor<2x12x14x3xf32>, %arg1: tensor<
 // CHECK-LABEL: @depthwise_conv2d_strided
 func.func @depthwise_conv2d_strided(%arg0: tensor<1x13x14x1xf32>, %arg1: tensor<1x1x1x1xf32>, %arg2: tensor<1xf32>) {
   // CHECK: -> tensor<1x5x7x1xf32>
-  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 3, 2>} : (tensor<1x13x14x1xf32>, tensor<1x1x1x1xf32>, tensor<1xf32>) -> tensor<1x5x7x1xf32>
+  %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 3, 2>} : (tensor<1x13x14x1xf32>, tensor<1x1x1x1xf32>, tensor<1xf32>) -> tensor<1x5x7x1xf32>
   return
 }
 
@@ -899,7 +899,7 @@ func.func @depthwise_conv2d_strided(%arg0: tensor<1x13x14x1xf32>, %arg1: tensor<
 // CHECK-LABEL: @transpose_conv2d_out_shape
 func.func @transpose_conv2d_out_shape(%arg0: tensor<2x?x?x3xf32>, %arg1: tensor<5x3x6x3xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<2x8x9x5xf32>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, 8, 9, -1>, stride = array<i64: 1, 1>} : (tensor<2x?x?x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x8x9x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, 8, 9, -1>, stride = array<i64: 1, 1>} : (tensor<2x?x?x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x8x9x5xf32>
   return
 }
 
@@ -908,7 +908,7 @@ func.func @transpose_conv2d_out_shape(%arg0: tensor<2x?x?x3xf32>, %arg1: tensor<
 // CHECK-LABEL: @transpose_conv2d_static
 func.func @transpose_conv2d_static(%arg0: tensor<2x16x14x3xf32>, %arg1: tensor<5x3x6x3xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<2x18x19x5xf32>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x16x14x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x?x?x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x16x14x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x?x?x5xf32>
   return
 }
 
@@ -917,7 +917,7 @@ func.func @transpose_conv2d_static(%arg0: tensor<2x16x14x3xf32>, %arg1: tensor<5
 // CHECK-LABEL: @transpose_conv2d_static_strided
 func.func @transpose_conv2d_static_strided(%arg0: tensor<2x16x14x3xf32>, %arg1: tensor<5x3x6x3xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<2x33x45x5xf32>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 2, 3>} : (tensor<2x16x14x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x?x?x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 2, 3>} : (tensor<2x16x14x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x?x?x5xf32>
   return
 }
 
@@ -926,7 +926,7 @@ func.func @transpose_conv2d_static_strided(%arg0: tensor<2x16x14x3xf32>, %arg1:
 // CHECK-LABEL: @transpose_conv2d_dynamic_input
 func.func @transpose_conv2d_dynamic_input(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<5x3x6x3xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<?x?x?x5xf32>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<?x?x?x?xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<?x?x?x?xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<?x?x?x5xf32>
   return
 }
 
@@ -935,7 +935,7 @@ func.func @transpose_conv2d_dynamic_input(%arg0: tensor<?x?x?x?xf32>, %arg1: ten
 // CHECK-LABEL: @transpose_conv2d_dynamic_weights
 func.func @transpose_conv2d_dynamic_weights(%arg0: tensor<2x6x4x3xf32>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<2x?x?x5xf32>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x6x4x3xf32>, tensor<?x?x?x?xf32>, tensor<5xf32>) -> tensor<2x?x?x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x6x4x3xf32>, tensor<?x?x?x?xf32>, tensor<5xf32>) -> tensor<2x?x?x5xf32>
   return
 }
 
@@ -944,7 +944,7 @@ func.func @transpose_conv2d_dynamic_weights(%arg0: tensor<2x6x4x3xf32>, %arg1: t
 // CHECK-LABEL: @transpose_conv2d_dynamic_bias
 func.func @transpose_conv2d_dynamic_bias(%arg0: tensor<2x6x4x3xf32>, %arg1: tensor<5x3x6x3xf32>, %arg2: tensor<?xf32>) {
   // CHECK: -> tensor<2x8x9x5xf32>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x6x4x3xf32>, tensor<5x3x6x3xf32>, tensor<?xf32>) -> tensor<2x8x9x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x6x4x3xf32>, tensor<5x3x6x3xf32>, tensor<?xf32>) -> tensor<2x8x9x5xf32>
   return
 }
 
@@ -953,14 +953,14 @@ func.func @transpose_conv2d_dynamic_bias(%arg0: tensor<2x6x4x3xf32>, %arg1: tens
 // CHECK-LABEL: @transpose_conv2d_padded
 func.func @transpose_conv2d_padded(%arg0: tensor<2x9x11x3xf32>, %arg1: tensor<5x3x6x3xf32>, %arg2: tensor<5xf32>) {
   // CHECK: -> tensor<2x10x13x5xf32>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 1, 0, 3, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x9x11x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x10x13x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 1, 0, 3, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<2x9x11x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>) -> tensor<2x10x13x5xf32>
   return
 }
 
 // CHECK-LABEL: @transpose_conv2d_strided
 func.func @transpose_conv2d_strided(%arg0: tensor<1x5x7x1xf32>, %arg1: tensor<1x1x1x1xf32>, %arg2: tensor<1xf32>) {
   // CHECK: -> tensor<1x13x13x1xf32>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 3, 2>} : (tensor<1x5x7x1xf32>, tensor<1x1x1x1xf32>, tensor<1xf32>) -> tensor<1x13x13x1xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 3, 2>} : (tensor<1x5x7x1xf32>, tensor<1x1x1x1xf32>, tensor<1xf32>) -> tensor<1x13x13x1xf32>
   return
 }
 
@@ -1368,7 +1368,7 @@ func.func @test_non_tosa_consumer_still_propagates(%arg0: tensor<1x1x8xf32>, %ar
 func.func @test_tosa_use_def_chain(%arg0: tensor<1x32x32x3xf32>, %arg1: tensor<16x3x3x3xf32>, %arg2: tensor<16xf32>) -> tensor<?x16x16x16xf32> {
   // CHECK: [[CONV:%.+]] = tosa.conv2d %arg0, %arg1, %arg2
   // CHECK: (tensor<1x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32>
-  %0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 1, 1, 1, 1>, stride = array<i64: 1, 1>} : (tensor<1x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<?x32x32x16xf32>
+  %0 = tosa.conv2d %arg0, %arg1, %arg2 {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 1, 1, 1, 1>, stride = array<i64: 1, 1>} : (tensor<1x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<?x32x32x16xf32>
   // CHECK: tosa.max_pool2d [[CONV]]
   // CHECK: (tensor<1x32x32x16xf32>) -> tensor<1x16x16x16xf32>
   %1 = tosa.max_pool2d %0 {kernel = array<i64: 2, 2>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 2, 2>} : (tensor<?x32x32x16xf32>) -> tensor<?x16x16x16xf32>
diff --git a/mlir/test/Integration/GPU/CUDA/assert.mlir b/mlir/test/Integration/GPU/CUDA/assert.mlir
index 06a9c1ca0d11..3d6527fe59b2 100644
--- a/mlir/test/Integration/GPU/CUDA/assert.mlir
+++ b/mlir/test/Integration/GPU/CUDA/assert.mlir
@@ -16,10 +16,10 @@ gpu.module @kernels {
 gpu.func @test_assert(%c0: i1, %c1: i1) kernel {
   %0 = gpu.thread_id x
   cf.assert %c1, "passing assertion"
-  gpu.printf "thread %lld: print after passing assertion\n" %0 : index
+  gpu.printf "thread %lld: print after passing assertion\n", %0 : index
   // Test callsite(callsite(name)) location.
   cf.assert %c0, "failing assertion" loc(callsite(callsite("callee_func_name"("callee_file.cc":7:9) at "caller_file.cc":10:8) at "caller2_file.cc":11:12))
-  gpu.printf "thread %lld: print after failing assertion\n" %0 : index
+  gpu.printf "thread %lld: print after failing assertion\n", %0 : index
   gpu.return
 }
 }
diff --git a/mlir/test/Integration/GPU/CUDA/printf.mlir b/mlir/test/Integration/GPU/CUDA/printf.mlir
index 99ea1208e9c5..15b0bf02d911 100644
--- a/mlir/test/Integration/GPU/CUDA/printf.mlir
+++ b/mlir/test/Integration/GPU/CUDA/printf.mlir
@@ -14,7 +14,7 @@ module attributes {gpu.container_module} {
             %0 = gpu.thread_id x
             %csti8 = arith.constant 2 : i8
             %cstf32 = arith.constant 3.0 : f32
-            gpu.printf "Hello from %lld, %d, %f\n" %0, %csti8, %cstf32  : index, i8, f32
+            gpu.printf "Hello from %lld, %d, %f\n", %0, %csti8, %cstf32  : index, i8, f32
             gpu.return
         }
     }
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir b/mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir
index c70c940564a2..a22a34b9393a 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir
@@ -43,7 +43,7 @@ module attributes {gpu.container_module} {
       %cnd2 =  arith.cmpi eq, %bidY, %c3 : index
       scf.if %cnd1 {
         scf.if %cnd2 {
-          gpu.printf "clusterIdx: (%d, %d, %d) in Cluster Dimension: (%d, %d, %d) blockIdx: (%d, %d, %d) \n" 
+          gpu.printf "clusterIdx: (%d, %d, %d) in Cluster Dimension: (%d, %d, %d) blockIdx: (%d, %d, %d) \n",
             %cidX_i32,
             %cidY_i32,
             %cidZ_i32,
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
index b50772f8249f..95bde40deb48 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
@@ -85,7 +85,7 @@ module @mymod {
       
       // Step 7. First thread does TMA load
       scf.if %10 {
-        gpu.printf "[GPU] TMA SIZE %d\0A" %c8192 : index
+        gpu.printf "[GPU] TMA SIZE %d\0A", %c8192 : index
         nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
         nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c8192 : !barrierType
       } else {
@@ -98,16 +98,16 @@ module @mymod {
 
       // Step 9. Print loaded data in 128b swizzled
       scf.if %10 {        
-        gpu.printf "===--- Matrix A ---=== %d \0A" %c-1_i32 : i32
+        gpu.printf "===--- Matrix A ---=== %d \0A", %c-1_i32 : i32
         scf.for %arg12 = %c0 to %c128 step %c1 {
           scf.for %arg13 = %c0 to %c64 step %c1 {
             %15 = memref.load %7[%arg12, %arg13] : !shmemlhs
             %16 = arith.extf %15 : f16 to f32
-            gpu.printf "%.0f,   " %16 : f32
+            gpu.printf "%.0f,   ", %16 : f32
           }
-          gpu.printf "%d\0A" %c-1_i32 : i32
+          gpu.printf "%d\0A", %c-1_i32 : i32
         }
-        gpu.printf "===----------------=== %d \0A" %c-1_i32 : i32
+        gpu.printf "===----------------=== %d \0A", %c-1_i32 : i32
       }
       gpu.terminator
     }
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
index 65e5fc0aff6a..e76fa03903b8 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
@@ -109,7 +109,7 @@ module @mymod {
       
       // Step 6. First thread does TMA load
       scf.if %10 {
-        gpu.printf "[GPU] TMA SIZE %d\0A" %c32768 : index
+        gpu.printf "[GPU] TMA SIZE %d\0A", %c32768 : index
         nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9[%c0] to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
         nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9[%c0] to %rhsShmem1 : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[128, 1]>, 3>
         nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9[%c0] to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[128, 1], offset: 4096>, 3>
@@ -124,16 +124,16 @@ module @mymod {
 
       // Step 8. Print loaded data in 128b swizzled
       scf.if %10 {        
-        gpu.printf "===--- Matrix B ---=== %d \n" %c-1_i32 : i32
+        gpu.printf "===--- Matrix B ---=== %d \n", %c-1_i32 : i32
         scf.for %ii = %c0 to %c64 step %c1 {
           scf.for %j = %c0 to %c128 step %c1 {
             %lhs0 = memref.load %rhsShmem[%ii, %j] : !shmemrhs
             %lhs032 = arith.extf %lhs0: f16 to f32
-            gpu.printf "%.0f,   " %lhs032 : f32
+            gpu.printf "%.0f,   ", %lhs032 : f32
           }
-          gpu.printf "%d\n" %c-1_i32 : i32
+          gpu.printf "%d\n", %c-1_i32 : i32
         }
-        gpu.printf "===----------------=== %d \n" %c-1_i32 : i32
+        gpu.printf "===----------------=== %d \n", %c-1_i32 : i32
       }
       gpu.barrier
       gpu.terminator
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
index 391fda82e1e1..acca9811f570 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
@@ -80,8 +80,8 @@ module @mymod {
         nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c6144 : <memorySpace = #gpu.address_space<workgroup>>
         %11 = memref.load %7[%c0, %c0] : memref<64x8xf32, 3>
         %12 = memref.load %8[%c0, %c0] : memref<8x128xf32, 3>
-        gpu.printf "[GPU] TMA BEFORE lhs[45][7] %f\0A" %11 : f32
-        gpu.printf "[GPU] TMA BEFORE rhs[7][0] %f\0A" %12 : f32
+        gpu.printf "[GPU] TMA BEFORE lhs[45][7] %f\0A", %11 : f32
+        gpu.printf "[GPU] TMA BEFORE rhs[7][0] %f\0A", %12 : f32
         nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
         nvgpu.tma.async.load %4[%c0, %c0], %9[%c0] to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
       } else {
@@ -92,8 +92,8 @@ module @mymod {
       scf.if %10 {
         %11 = memref.load %7[%c45, %c7] : memref<64x8xf32, 3>
         %12 = memref.load %8[%c7, %c0] : memref<8x128xf32, 3>
-        gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A" %11 : f32
-        gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A" %12 : f32
+        gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A", %11 : f32
+        gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A", %12 : f32
       }
       gpu.terminator
     }
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
index f83f65bb2963..fe6c645357ec 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
@@ -96,8 +96,8 @@ func.func @main() {
     scf.if %10 {
       %11 = memref.load %out[%c45, %c7] : memref<64x8xf32, 3>
       %12 = memref.load %out_1[%c7, %c0] : memref<8x128xf32, 3>
-      gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A" %11 : f32
-      gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A" %12 : f32
+      gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A", %11 : f32
+      gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A", %12 : f32
     }
     gpu.terminator
   }
diff --git a/mlir/test/Integration/GPU/ROCM/printf.mlir b/mlir/test/Integration/GPU/ROCM/printf.mlir
index d5e6e3757540..4a0e4d34bfab 100644
--- a/mlir/test/Integration/GPU/ROCM/printf.mlir
+++ b/mlir/test/Integration/GPU/ROCM/printf.mlir
@@ -13,7 +13,7 @@ module attributes {gpu.container_module} {
     gpu.module @kernels {
         gpu.func @hello() kernel {
             %0 = gpu.thread_id x
-            gpu.printf "Hello from %d\n" %0 : index
+            gpu.printf "Hello from %d\n", %0 : index
             gpu.return
         }
     }
diff --git a/mlir/test/Target/LLVMIR/Import/metadata-alias-scopes.ll b/mlir/test/Target/LLVMIR/Import/metadata-alias-scopes.ll
index f5128ff76bc5..bf4c85786216 100644
--- a/mlir/test/Target/LLVMIR/Import/metadata-alias-scopes.ll
+++ b/mlir/test/Target/LLVMIR/Import/metadata-alias-scopes.ll
@@ -92,3 +92,38 @@ declare void @foo(ptr %arg1)
 !0 = distinct !{!0, !"The domain"}
 !1 = !{!1, !0}
 !2 = !{!1}
+
+; // -----
+
+; CHECK: #[[DOMAIN:.*]] = #llvm.alias_scope_domain<id = "domain1">
+; CHECK: #[[$SCOPE0:.*]] = #llvm.alias_scope<id = "scopeid1", domain = #[[DOMAIN]], description = "The first scope">
+; CHECK: #[[$SCOPE1:.*]] = #llvm.alias_scope<id = "scopeid2", domain = #[[DOMAIN]]>
+; CHECK: #[[$SCOPE2:.*]] = #llvm.alias_scope<id = "scopeid3", domain = #[[DOMAIN]]>
+
+; CHECK-LABEL: llvm.func @alias_scope
+define void @alias_scope(ptr %arg1) {
+  ; CHECK: llvm.load
+  ; CHECK-SAME:  alias_scopes = [#[[$SCOPE0]]]
+  ; CHECK-SAME:  noalias_scopes = [#[[$SCOPE1]], #[[$SCOPE2]]]
+  %1 = load i32, ptr %arg1, !alias.scope !4, !noalias !7
+  ; CHECK: llvm.load
+  ; CHECK-SAME:  alias_scopes = [#[[$SCOPE1]]]
+  ; CHECK-SAME:  noalias_scopes = [#[[$SCOPE0]], #[[$SCOPE2]]]
+  %2 = load i32, ptr %arg1, !alias.scope !5, !noalias !8
+  ; CHECK: llvm.load
+  ; CHECK-SAME:  alias_scopes = [#[[$SCOPE2]]]
+  ; CHECK-SAME:  noalias_scopes = [#[[$SCOPE0]], #[[$SCOPE1]]]
+  %3 = load i32, ptr %arg1, !alias.scope !6, !noalias !9
+  ret void
+}
+
+!0 = !{!"domain1"}
+!1 = !{!"scopeid1", !0, !"The first scope"}
+!2 = !{!"scopeid2", !0}
+!3 = !{!"scopeid3", !0}
+!4 = !{!1}
+!5 = !{!2}
+!6 = !{!3}
+!7 = !{!2, !3}
+!8 = !{!1, !3}
+!9 = !{!1, !2}
diff --git a/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir b/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir
index fa3395533af2..fb71a51512ae 100644
--- a/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir
+++ b/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir
@@ -104,3 +104,54 @@ llvm.func @self_reference() {
 // CHECK-DAG: ![[SCOPES]] = !{![[SCOPE]]}
 // CHECK-DAG: = !DISubroutineType(types: ![[TYPES:[0-9]+]])
 // CHECK-DAG: ![[TYPES]] = !{null}
+
+// -----
+
+llvm.func @foo(%arg0: !llvm.ptr)
+
+#alias_scope_domain = #llvm.alias_scope_domain<id = "domain1", description = "The domain">
+#alias_scope1 = #llvm.alias_scope<id = "scope1", domain = #alias_scope_domain, description = "The first scope">
+#alias_scope2 = #llvm.alias_scope<id = "scope2", domain = #alias_scope_domain>
+#alias_scope3 = #llvm.alias_scope<id = "scope3", domain = #alias_scope_domain>
+
+// CHECK-LABEL: @alias_scopes
+llvm.func @alias_scopes(%arg1 : !llvm.ptr) {
+  %0 = llvm.mlir.constant(0 : i32) : i32
+  // CHECK:  call void @llvm.experimental.noalias.scope.decl(metadata ![[SCOPES1:[0-9]+]])
+  llvm.intr.experimental.noalias.scope.decl #alias_scope1
+  // CHECK:  store {{.*}}, !alias.scope ![[SCOPES1]], !noalias ![[SCOPES23:[0-9]+]]
+  llvm.store %0, %arg1 {alias_scopes = [#alias_scope1], noalias_scopes = [#alias_scope2, #alias_scope3]} : i32, !llvm.ptr
+  // CHECK:  load {{.*}}, !alias.scope ![[SCOPES2:[0-9]+]], !noalias ![[SCOPES13:[0-9]+]]
+  %1 = llvm.load %arg1 {alias_scopes = [#alias_scope2], noalias_scopes = [#alias_scope1, #alias_scope3]} : !llvm.ptr -> i32
+  // CHECK:  atomicrmw {{.*}}, !alias.scope ![[SCOPES3:[0-9]+]], !noalias ![[SCOPES12:[0-9]+]]
+  %2 = llvm.atomicrmw add %arg1, %0 monotonic {alias_scopes = [#alias_scope3], noalias_scopes = [#alias_scope1, #alias_scope2]} : !llvm.ptr, i32
+  // CHECK:  cmpxchg {{.*}}, !alias.scope ![[SCOPES3]]
+  %3 = llvm.cmpxchg %arg1, %1, %2 acq_rel monotonic {alias_scopes = [#alias_scope3]} : !llvm.ptr, i32
+  %5 = llvm.mlir.constant(42 : i8) : i8
+  // CHECK:  llvm.memcpy{{.*}}, !alias.scope ![[SCOPES3]]
+  "llvm.intr.memcpy"(%arg1, %arg1, %0) <{isVolatile = false}> {alias_scopes = [#alias_scope3]} : (!llvm.ptr, !llvm.ptr, i32) -> ()
+  // CHECK:  llvm.memset{{.*}}, !noalias ![[SCOPES3]]
+  "llvm.intr.memset"(%arg1, %5, %0) <{isVolatile = false}> {noalias_scopes = [#alias_scope3]} : (!llvm.ptr, i8, i32) -> ()
+  // CHECK: call void @foo({{.*}} !alias.scope ![[SCOPES3]]
+  llvm.call @foo(%arg1) {alias_scopes = [#alias_scope3]} : (!llvm.ptr) -> ()
+  // CHECK: call void @foo({{.*}} !noalias ![[SCOPES3]]
+  llvm.call @foo(%arg1) {noalias_scopes = [#alias_scope3]} : (!llvm.ptr) -> ()
+  llvm.return
+}
+
+// Check the intrinsic declarations.
+// CHECK-DAG: declare void @llvm.experimental.noalias.scope.decl(metadata)
+// CHECK-DAG: declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg)
+// CHECK-DAG: declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg)
+
+// Check the translated metadata.
+// CHECK-DAG: ![[DOMAIN:[0-9]+]] = !{!"domain1", !"The domain"}
+// CHECK-DAG: ![[SCOPE1:[0-9]+]] = !{!"scope1", ![[DOMAIN]], !"The first scope"}
+// CHECK-DAG: ![[SCOPE2:[0-9]+]] = !{!"scope2", ![[DOMAIN]]}
+// CHECK-DAG: ![[SCOPE3:[0-9]+]] = !{!"scope3", ![[DOMAIN]]}
+// CHECK-DAG: ![[SCOPES1]] = !{![[SCOPE1]]}
+// CHECK-DAG: ![[SCOPES2]] = !{![[SCOPE2]]}
+// CHECK-DAG: ![[SCOPES3]] = !{![[SCOPE3]]}
+// CHECK-DAG: ![[SCOPES12]] = !{![[SCOPE1]], ![[SCOPE2]]}
+// CHECK-DAG: ![[SCOPES13]] = !{![[SCOPE1]], ![[SCOPE3]]}
+// CHECK-DAG: ![[SCOPES23]] = !{![[SCOPE2]], ![[SCOPE3]]}
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index b69d77496351..2d7710e7cbf2 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -556,9 +556,7 @@ llvm.func @kernel_func() attributes {nvvm.kernel} {
   llvm.return
 }
 
-// CHECK:     !nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-// CHECK:     {ptr @kernel_func, !"kernel", i32 1}
+// CHECK: ptx_kernel void @kernel_func
 
 // -----
 
@@ -566,9 +564,8 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 1, 2
   llvm.return
 }
 
+// CHECK: define ptx_kernel void @kernel_func
 // CHECK:     !nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-// CHECK:     {ptr @kernel_func, !"kernel", i32 1}
 // CHECK:     {ptr @kernel_func, !"maxntidx", i32 1}
 // CHECK:     {ptr @kernel_func, !"maxntidy", i32 23}
 // CHECK:     {ptr @kernel_func, !"maxntidz", i32 32}
@@ -578,9 +575,8 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.reqntid = array<i32: 1, 2
   llvm.return
 }
 
+// CHECK: define ptx_kernel void @kernel_func
 // CHECK:     !nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-// CHECK:     {ptr @kernel_func, !"kernel", i32 1}
 // CHECK:     {ptr @kernel_func, !"reqntidx", i32 1}
 // CHECK:     {ptr @kernel_func, !"reqntidy", i32 23}
 // CHECK:     {ptr @kernel_func, !"reqntidz", i32 32}
@@ -590,31 +586,28 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_dim = array<i32:
   llvm.return
 }
 
+// CHECK: define ptx_kernel void @kernel_func
 // CHECK:     !nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
 // CHECK:     {ptr @kernel_func, !"cluster_dim_x", i32 3}
 // CHECK:     {ptr @kernel_func, !"cluster_dim_y", i32 5}
 // CHECK:     {ptr @kernel_func, !"cluster_dim_z", i32 7}
-// CHECK:     {ptr @kernel_func, !"kernel", i32 1}
 // -----
 
 llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.cluster_max_blocks = 8} {
   llvm.return
 }
 
+// CHECK: define ptx_kernel void @kernel_func
 // CHECK:     !nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
 // CHECK:     {ptr @kernel_func, !"cluster_max_blocks", i32 8}
-// CHECK:     {ptr @kernel_func, !"kernel", i32 1}
 // -----
 
 llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.minctasm = 16} {
   llvm.return
 }
 
+// CHECK: define ptx_kernel void @kernel_func
 // CHECK:     !nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-// CHECK:     {ptr @kernel_func, !"kernel", i32 1}
 // CHECK:     {ptr @kernel_func, !"minctasm", i32 16}
 // -----
 
@@ -622,9 +615,8 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxnreg = 16} {
   llvm.return
 }
 
+// CHECK: define ptx_kernel void @kernel_func
 // CHECK:     !nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-// CHECK:     {ptr @kernel_func, !"kernel", i32 1}
 // CHECK:     {ptr @kernel_func, !"maxnreg", i32 16}
 // -----
 
@@ -633,9 +625,8 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 1, 2
   llvm.return
 }
 
+// CHECK: define ptx_kernel void @kernel_func
 // CHECK:     !nvvm.annotations =
-// CHECK-NOT: {ptr @nvvm_special_regs, !"kernel", i32 1}
-// CHECK:     {ptr @kernel_func, !"kernel", i32 1}
 // CHECK:     {ptr @kernel_func, !"maxnreg", i32 32}
 // CHECK:     {ptr @kernel_func, !"maxntidx", i32 1}
 // CHECK:     {ptr @kernel_func, !"maxntidy", i32 23}
@@ -643,19 +634,19 @@ llvm.func @kernel_func() attributes {nvvm.kernel, nvvm.maxntid = array<i32: 1, 2
 // CHECK:     {ptr @kernel_func, !"minctasm", i32 16}
 
 // -----
+// CHECK: define ptx_kernel void @kernel_func
 // CHECK: !nvvm.annotations =
 // CHECK: !1 = !{ptr @kernel_func, !"grid_constant", !2}
 // CHECK: !2 = !{i32 1}
-// CHECK: !3 = !{ptr @kernel_func, !"kernel", i32 1}
 llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}) attributes {nvvm.kernel} {
   llvm.return
 }
 
 // -----
+// CHECK: define ptx_kernel void @kernel_func
 // CHECK: !nvvm.annotations =
 // CHECK: !1 = !{ptr @kernel_func, !"grid_constant", !2}
 // CHECK: !2 = !{i32 1, i32 3}
-// CHECK: !3 = !{ptr @kernel_func, !"kernel", i32 1}
 llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}, %arg1: f32, %arg2: !llvm.ptr {llvm.byval = f32, nvvm.grid_constant}) attributes {nvvm.kernel} {
   llvm.return
 }
diff --git a/mlir/test/Transforms/location-snapshot.mlir b/mlir/test/Transforms/location-snapshot.mlir
index 9f48cb6e3b3f..aeddfedd08ae 100644
--- a/mlir/test/Transforms/location-snapshot.mlir
+++ b/mlir/test/Transforms/location-snapshot.mlir
@@ -1,5 +1,6 @@
 // RUN: mlir-opt -allow-unregistered-dialect -snapshot-op-locations='filename=%/t' -mlir-print-local-scope -mlir-print-debuginfo %s | FileCheck %s -DFILE=%/t
 // RUN: mlir-opt -allow-unregistered-dialect -snapshot-op-locations='filename=%/t tag='tagged'' -mlir-print-local-scope -mlir-print-debuginfo %s | FileCheck %s --check-prefix=TAG -DFILE=%/t
+// RUN: mlir-opt -allow-unregistered-dialect -snapshot-op-locations='filename=%/t print-debuginfo' -mlir-print-local-scope -mlir-print-debuginfo %s | FileCheck %s --check-prefix=DBG -DFILE=%/t && cat %/t | FileCheck %s --check-prefix=DBGFILE
 
 // CHECK: func @function(
 // CHECK-NEXT: loc("[[FILE]]":{{[0-9]+}}:{{[0-9]+}})
@@ -15,3 +16,18 @@ func.func @function() -> i32 {
   %1 = "foo"() : () -> i32 loc("original")
   return %1 : i32 loc("original")
 } loc("original")
+
+// DBG: func @function2(
+// DBG-NEXT: loc("[[FILE]]":{{[0-9]+}}:{{[0-9]+}})
+// DBG-NEXT: loc("[[FILE]]":{{[0-9]+}}:{{[0-9]+}})
+// DBG-NEXT: } loc("[[FILE]]":{{[0-9]+}}:{{[0-9]+}})
+
+// DBGFILE: func @function2(
+// DBGFILE-NEXT: loc("{{.*}}location-snapshot.mlir":{{[0-9]+}}:{{[0-9]+}})
+// DBGFILE-NEXT: loc("{{.*}}location-snapshot.mlir":{{[0-9]+}}:{{[0-9]+}})
+// DBGFILE-NEXT: } loc("{{.*}}location-snapshot.mlir":{{[0-9]+}}:{{[0-9]+}})
+
+func.func @function2() -> i32 {
+  %1 = "foo"() : () -> i32
+  return %1 : i32
+}
+\ No newline at end of file
diff --git a/mlir/test/Transforms/loop-invariant-code-motion.mlir b/mlir/test/Transforms/loop-invariant-code-motion.mlir
index e4c423ce7052..5133c14414c9 100644
--- a/mlir/test/Transforms/loop-invariant-code-motion.mlir
+++ b/mlir/test/Transforms/loop-invariant-code-motion.mlir
@@ -124,6 +124,64 @@ func.func @invariant_affine_if() {
 
 // -----
 
+func.func @hoist_invariant_affine_if_success(%lb: index, %ub: index, %step: index) -> i32 {
+  %cst_0 = arith.constant 0 : i32
+  %cst_42 = arith.constant 42 : i32
+  %sum_result = affine.for %i = %lb to %ub iter_args(%acc = %cst_0) -> i32 {
+    %conditional_add = affine.if affine_set<() : ()> () -> (i32) {
+      %add = arith.addi %cst_42, %cst_42 : i32
+      affine.yield %add : i32
+    } else {
+      %poison = ub.poison : i32
+      affine.yield %poison : i32
+    }
+    %sum = arith.addi %acc, %conditional_add : i32
+    affine.yield %sum : i32
+  }
+
+  // CHECK-LABEL: hoist_invariant_affine_if_success
+  // CHECK-NEXT: arith.constant 0 : i32
+  // CHECK-NEXT: %[[CST:.*]] = arith.constant 42 : i32
+  // CHECK-NEXT: %[[IF:.*]] = affine.if
+  // CHECK-NEXT: arith.addi %[[CST]], %[[CST]] : i32
+  // CHECK: affine.for
+  // CHECK-NOT: affine.if
+  // CHECK-NEXT: arith.addi %{{.*}}, %[[IF]]
+
+  return %sum_result : i32
+}
+
+// -----
+
+func.func @hoist_variant_affine_if_failure(%lb: index, %ub: index, %step: index) -> i32 {
+  %cst_0 = arith.constant 0 : i32
+  %cst_42 = arith.constant 42 : i32
+  %ind_7 = arith.constant 7 : index
+  %sum_result = affine.for %i = %lb to %ub iter_args(%acc = %cst_0) -> i32 {
+    %conditional_add = affine.if affine_set<(d0, d1) : (d1 - d0 >= 0)> (%i, %ind_7) -> (i32) {
+      %add = arith.addi %cst_42, %cst_42 : i32
+      affine.yield %add : i32
+    } else {
+      %poison = ub.poison : i32
+      affine.yield %poison : i32
+    }
+    %sum = arith.addi %acc, %conditional_add : i32
+    affine.yield %sum : i32
+  }
+
+  // CHECK-LABEL: hoist_variant_affine_if_failure
+  // CHECK-NEXT: arith.constant 0 : i32
+  // CHECK-NEXT: %[[CST:.*]] = arith.constant 42 : i32
+  // CHECK-NEXT: arith.constant 7 : index
+  // CHECK-NEXT: affine.for
+  // CHECK-NEXT: %[[IF:.*]] = affine.if
+  // CHECK: arith.addi %{{.*}}, %[[IF]]
+
+  return %sum_result : i32
+}
+
+// -----
+
 func.func @hoist_affine_for_with_unknown_trip_count(%lb: index, %ub: index) {
   affine.for %arg0 = 0 to 10 {
     affine.for %arg1 = %lb to %ub {
@@ -383,6 +441,69 @@ func.func @parallel_loop_with_invariant() {
 
 // -----
 
+func.func @hoist_invariant_scf_if_success(%lb: index, %ub: index, %step: index) -> i32 {
+  %cst_0 = arith.constant 0 : i32
+  %cst_42 = arith.constant 42 : i32
+  %true = arith.constant true
+  %sum_result = scf.for %i = %lb to %ub step %step iter_args(%acc = %cst_0) -> i32 {
+    %conditional_add = scf.if %true -> (i32) {
+      %add = arith.addi %cst_42, %cst_42 : i32
+      scf.yield %add : i32
+    } else {
+      %poison = ub.poison : i32
+      scf.yield %poison : i32
+    }
+    %sum = arith.addi %acc, %conditional_add : i32
+    scf.yield %sum : i32
+  }
+
+  // CHECK-LABEL: hoist_invariant_scf_if_success
+  // CHECK-NEXT: arith.constant 0 : i32
+  // CHECK-NEXT: %[[CST:.*]] = arith.constant 42 : i32
+  // CHECK-NEXT: %[[TRUE:.*]] = arith.constant true
+  // CHECK-NEXT: %[[IF:.*]] = scf.if %[[TRUE]]
+  // CHECK-NEXT: arith.addi %[[CST]], %[[CST]] : i32
+  // CHECK: scf.for
+  // CHECK-NOT: scf.if
+  // CHECK-NEXT: arith.addi %{{.*}}, %[[IF]]
+
+  return %sum_result : i32
+}
+
+// -----
+
+func.func @hoist_variant_scf_if_failure(%lb: index, %ub: index, %step: index) -> i32 {
+  %cst_0 = arith.constant 0 : i32
+  %cst_42 = arith.constant 42 : i32
+  %ind_7 = arith.constant 7 : index
+  %sum_result = scf.for %i = %lb to %ub step %step iter_args(%acc = %cst_0) -> i32 {
+    %cond = arith.cmpi ult, %i, %ind_7 : index
+    %conditional_add = scf.if %cond -> (i32) {
+      %add = arith.addi %cst_42, %cst_42 : i32
+      scf.yield %add : i32
+    } else {
+      %poison = ub.poison : i32
+      scf.yield %poison : i32
+    }
+    %sum = arith.addi %acc, %conditional_add : i32
+    scf.yield %sum : i32
+  }
+
+  // CHECK-LABEL: hoist_variant_scf_if_failure
+  // CHECK-NEXT: arith.constant 0 : i32
+  // CHECK-NEXT: %[[CST_42:.*]] = arith.constant 42 : i32
+  // CHECK-NEXT: %[[CST_7:.*]] = arith.constant 7 : index
+  // CHECK-NEXT: scf.for %[[IV:.*]] = %{{.*}} to %{{.*}}
+  // CHECK-NEXT: %[[CMP:.*]] = arith.cmpi ult, %[[IV]], %[[CST_7]]
+  // CHECK-NEXT: %[[IF:.*]] = scf.if %[[CMP]]
+  // CHECK-NEXT: arith.addi %[[CST_42]], %[[CST_42]] : i32
+  // CHECK: arith.addi %{{.*}}, %[[IF]]
+
+  return %sum_result : i32
+}
+
+// -----
+
 func.func private @make_val() -> (index)
 
 // CHECK-LABEL: func @nested_uses_inside
diff --git a/mlir/test/lib/Dialect/Tosa/TosaTestPasses.cpp b/mlir/test/lib/Dialect/Tosa/TosaTestPasses.cpp
index ac904c3e01c9..83db1188861a 100644
--- a/mlir/test/lib/Dialect/Tosa/TosaTestPasses.cpp
+++ b/mlir/test/lib/Dialect/Tosa/TosaTestPasses.cpp
@@ -149,7 +149,7 @@ ConvertTosaConv2DOp::matchAndRewrite(Operation *op,
       op->getLoc(), newTosaConv2DOpType, tosaConv2DOp.getInput(),
       tosaConv2DOp.getWeight(), tosaConv2DOp.getBias(),
       tosaConv2DOp.getPadAttr(), tosaConv2DOp.getStrideAttr(),
-      tosaConv2DOp.getDilationAttr());
+      tosaConv2DOp.getDilationAttr(), tosaConv2DOp.getAccTypeAttr());
 
   // Create rescale to quantized type
   double inputScale = inputQType.getScale();
diff --git a/mlir/test/python/execution_engine.py b/mlir/test/python/execution_engine.py
index 6d3a8db8c24b..0d12c35d96be 100644
--- a/mlir/test/python/execution_engine.py
+++ b/mlir/test/python/execution_engine.py
@@ -306,7 +306,7 @@ def testUnrankedMemRefWithOffsetCallback():
         log(arr)
 
     with Context():
-        # The module takes a subview of the argument memref, casts it to an unranked memref and 
+        # The module takes a subview of the argument memref, casts it to an unranked memref and
         # calls the callback with it.
         module = Module.parse(
             r"""
diff --git a/polly/include/polly/CodeGen/BlockGenerators.h b/polly/include/polly/CodeGen/BlockGenerators.h
index 4e2645468a74..401e80eb0fec 100644
--- a/polly/include/polly/CodeGen/BlockGenerators.h
+++ b/polly/include/polly/CodeGen/BlockGenerators.h
@@ -632,7 +632,7 @@ public:
 };
 
 /// Generator for new versions of polyhedral region statements.
-class RegionGenerator final : BlockGenerator {
+class RegionGenerator final : public BlockGenerator {
 public:
   /// Create a generator for regions.
   ///
diff --git a/polly/lib/CodeGen/BlockGenerators.cpp b/polly/lib/CodeGen/BlockGenerators.cpp
index b76d8f4c18a5..6d723d6e3329 100644
--- a/polly/lib/CodeGen/BlockGenerators.cpp
+++ b/polly/lib/CodeGen/BlockGenerators.cpp
@@ -1000,7 +1000,7 @@ BasicBlock *RegionGenerator::repairDominance(BasicBlock *BB,
   BasicBlock *BBCopyIDom = EndBlockMap.lookup(BBIDom);
 
   if (BBCopyIDom)
-    DT.changeImmediateDominator(BBCopy, BBCopyIDom);
+    GenDT->changeImmediateDominator(BBCopy, BBCopyIDom);
 
   return StartBlockMap.lookup(BBIDom);
 }
@@ -1069,8 +1069,8 @@ void RegionGenerator::copyStmt(ScopStmt &Stmt, LoopToScevMapT &LTS,
   // Create a dedicated entry for the region where we can reload all demoted
   // inputs.
   BasicBlock *EntryBB = R->getEntry();
-  BasicBlock *EntryBBCopy = SplitBlock(Builder.GetInsertBlock(),
-                                       &*Builder.GetInsertPoint(), &DT, &LI);
+  BasicBlock *EntryBBCopy = SplitBlock(
+      Builder.GetInsertBlock(), &*Builder.GetInsertPoint(), GenDT, GenLI);
   EntryBBCopy->setName("polly.stmt." + EntryBB->getName() + ".entry");
   Builder.SetInsertPoint(&EntryBBCopy->front());
 
@@ -1136,7 +1136,7 @@ void RegionGenerator::copyStmt(ScopStmt &Stmt, LoopToScevMapT &LTS,
 
   // Now create a new dedicated region exit block and add it to the region map.
   BasicBlock *ExitBBCopy = SplitBlock(Builder.GetInsertBlock(),
-                                      &*Builder.GetInsertPoint(), &DT, &LI);
+                                      &*Builder.GetInsertPoint(), GenDT, GenLI);
   ExitBBCopy->setName("polly.stmt." + R->getExit()->getName() + ".exit");
   StartBlockMap[R->getExit()] = ExitBBCopy;
   EndBlockMap[R->getExit()] = ExitBBCopy;
@@ -1145,7 +1145,7 @@ void RegionGenerator::copyStmt(ScopStmt &Stmt, LoopToScevMapT &LTS,
   assert(ExitDomBBCopy &&
          "Common exit dominator must be within region; at least the entry node "
          "must match");
-  DT.changeImmediateDominator(ExitBBCopy, ExitDomBBCopy);
+  GenDT->changeImmediateDominator(ExitBBCopy, ExitDomBBCopy);
 
   // As the block generator doesn't handle control flow we need to add the
   // region control flow by hand after all blocks have been copied.
diff --git a/polly/lib/CodeGen/IslNodeBuilder.cpp b/polly/lib/CodeGen/IslNodeBuilder.cpp
index d76f6251ea4c..739bd63a5eb8 100644
--- a/polly/lib/CodeGen/IslNodeBuilder.cpp
+++ b/polly/lib/CodeGen/IslNodeBuilder.cpp
@@ -612,6 +612,7 @@ void IslNodeBuilder::createForParallel(__isl_take isl_ast_node *For) {
   GenLI = SubLI;
   GenSE = SubSE.get();
   BlockGen.switchGeneratedFunc(SubFn, GenDT, GenLI, GenSE);
+  RegionGen.switchGeneratedFunc(SubFn, GenDT, GenLI, GenSE);
   ExprBuilder.switchGeneratedFunc(SubFn, GenDT, GenLI, GenSE);
   Builder.SetInsertPoint(&*LoopBody);
 
@@ -681,6 +682,7 @@ void IslNodeBuilder::createForParallel(__isl_take isl_ast_node *For) {
   IDToValue = std::move(IDToValueCopy);
   ValueMap = std::move(CallerGlobals);
   ExprBuilder.switchGeneratedFunc(CallerFn, CallerDT, CallerLI, CallerSE);
+  RegionGen.switchGeneratedFunc(CallerFn, CallerDT, CallerLI, CallerSE);
   BlockGen.switchGeneratedFunc(CallerFn, CallerDT, CallerLI, CallerSE);
   Builder.SetInsertPoint(&*AfterLoop);
 
diff --git a/polly/test/CodeGen/reggen_domtree_crash.ll b/polly/test/CodeGen/reggen_domtree_crash.ll
new file mode 100644
index 000000000000..58c27091a22c
--- /dev/null
+++ b/polly/test/CodeGen/reggen_domtree_crash.ll
@@ -0,0 +1,41 @@
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -S < %s | FileCheck %s
+
+; CHECK: define ptr @ham(ptr %arg, i64 %arg1, i1 %arg2)
+
+; This test is added to verify if the following IR does not crash on using different Dominator Tree when using polly parallel flag.
+
+; ModuleID = '<stdin>'
+source_filename = "<stdin>"
+
+define ptr @ham(ptr %arg, i64 %arg1, i1 %arg2) {
+bb:
+  br label %bb3
+
+bb3:                                              ; preds = %bb8, %bb
+  %phi = phi i64 [ 0, %bb ], [ %add9, %bb8 ]
+  %getelementptr = getelementptr [64 x i16], ptr %arg, i64 %phi
+  br label %bb4
+
+bb4:                                              ; preds = %bb7, %bb3
+  %phi5 = phi i64 [ %add, %bb7 ], [ 0, %bb3 ]
+  %load = load i16, ptr null, align 2
+  br i1 %arg2, label %bb7, label %bb6
+
+bb6:                                              ; preds = %bb4
+  store i16 0, ptr %getelementptr, align 2
+  br label %bb7
+
+bb7:                                              ; preds = %bb6, %bb4
+  %add = add i64 %phi5, 1
+  %icmp = icmp ne i64 %phi5, 64
+  br i1 %icmp, label %bb4, label %bb8
+
+bb8:                                              ; preds = %bb7
+  %add9 = add i64 %phi, 1
+  %icmp10 = icmp ult i64 %phi, %arg1
+  br i1 %icmp10, label %bb3, label %bb11
+
+bb11:                                             ; preds = %bb8
+  ret ptr null
+}
+
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 09811eb06ff0..ac3f5034d2bf 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -670,6 +670,8 @@ libc_support_library(
     deps = [
         ":__support_cpp_array",
         ":__support_cpp_iterator",
+        ":__support_libc_assert",
+        ":string_memory_utils",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
index 66ac46437a1d..a8b37c5ddcc2 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
@@ -115,7 +115,7 @@ libc_support_library(
     hdrs = ["SortingTest.h"],
     deps = [
         "//libc:__support_macros_config",
-        "//libc:qsort_util",
+        "//libc:qsort",
         "//libc/test/UnitTest:LibcUnitTest",
     ],
 )
diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
index db6577cb2a32..181fe3b7c331 100644
--- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
@@ -1071,6 +1071,7 @@ cc_binary(
         "//lldb:APIHeaders",
         "//lldb:Headers",
         "//lldb:Host",
+        "//lldb:Utility",
         "//lldb:liblldb.wrapper",
         "//llvm:Option",
         "//llvm:Support",
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 18ac78174856..bfcb53e1f6b0 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -1322,6 +1322,7 @@ cc_library(
     includes = ["include"],
     textual_hdrs = [
         "include/llvm/TargetParser/AArch64CPUFeatures.inc",
+        "include/llvm/TargetParser/AArch64FeatPriorities.inc",
         "include/llvm/TargetParser/AArch64TargetParserDef.inc",
         "include/llvm/TargetParser/ARMTargetParserDef.inc",
         "include/llvm/TargetParser/RISCVTargetParserDef.inc",
@@ -2270,6 +2271,7 @@ llvm_target_lib_list = [lib for lib in [
             ("-gen-register-info", "lib/Target/RISCV/RISCVGenRegisterInfo.inc"),
             ("-gen-subtarget", "lib/Target/RISCV/RISCVGenSubtargetInfo.inc"),
             ("-gen-searchable-tables", "lib/Target/RISCV/RISCVGenSearchableTables.inc"),
+            ("-gen-exegesis", "lib/Target/RISCV/RISCVGenExegesis.inc"),
         ],
         "tbl_deps": [
             ":riscv_isel_target_gen",
author	NAKAMURA Takumi <geek4civic@gmail.com>	2025-01-10 19:25:56 +0900
committer	NAKAMURA Takumi <geek4civic@gmail.com>	2025-01-10 19:25:56 +0900
commit	63f5dc16d6bfca0512fb034052b41d13c3751e20 (patch)
tree	e70266be1fda941e0974e71e3d2c1cf080081311
parent	9e5734688ed3d5f6b3fb76a26b3d90a736d60781 (diff)
parent	397ac44f623f891d8f05d6673a95984ac0a26671 (diff)