From ba5f31cfaa2452a4a94a482b53d899d6f2ee0e66 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Thu, 17 Jul 2025 14:29:25 +0000
Subject: [lldb][test] Disable TestChildCountTruncation on Windows

This fails because it tells clang to use DWARF which link.exe
then discards.

The test may not need DWARF, but I'm going to confirm that in
a follow-up PR review.

Test added by https://github.com/llvm/llvm-project/pull/149088.
---
 lldb/test/Shell/Settings/TestChildCountTruncation.test | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lldb/test/Shell/Settings/TestChildCountTruncation.test b/lldb/test/Shell/Settings/TestChildCountTruncation.test
index a96a0d8310ee..2660ccae1aa5 100644
--- a/lldb/test/Shell/Settings/TestChildCountTruncation.test
+++ b/lldb/test/Shell/Settings/TestChildCountTruncation.test
@@ -1,6 +1,9 @@
 # Test that we warn the user about truncated output
 # when target.max-children-count wasn't explicitly set.
 
+# link.exe discards the DWARF information needed.
+# UNSUPPORTED: system-windows
+
 # RUN: split-file %s %t
 # RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out
 # RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \
-- 
cgit v1.2.3


From 9fa3971fac27fbe0a6e3b9745d201c16f5f98bc2 Mon Sep 17 00:00:00 2001
From: Piotr Fusik <p.fusik@samsung.com>
Date: Thu, 17 Jul 2025 16:37:59 +0200
Subject: [DAGCombiner] Fold vector subtraction if above threshold to `umin`
 (#148834)

This extends #134235 and #135194 to vectors.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp    | 87 ++++++++++++++---------
 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll | 88 +++++++++---------------
 llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll      | 88 +++++++++---------------
 3 files changed, 122 insertions(+), 141 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0e8e4c9618bb..40464e91f9ef 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -609,6 +609,8 @@ namespace {
     SDValue foldABSToABD(SDNode *N, const SDLoc &DL);
     SDValue foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True,
                             SDValue False, ISD::CondCode CC, const SDLoc &DL);
+    SDValue foldSelectToUMin(SDValue LHS, SDValue RHS, SDValue True,
+                             SDValue False, ISD::CondCode CC, const SDLoc &DL);
     SDValue unfoldMaskedMerge(SDNode *N);
     SDValue unfoldExtremeBitClearingToShifts(SDNode *N);
     SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
@@ -859,7 +861,7 @@ namespace {
       auto LK = TLI.getTypeConversion(*DAG.getContext(), VT);
       return (LK.first == TargetLoweringBase::TypeLegal ||
               LK.first == TargetLoweringBase::TypePromoteInteger) &&
-             TLI.isOperationLegal(ISD::UMIN, LK.second);
+             TLI.isOperationLegalOrCustom(ISD::UMIN, LK.second);
     }
 
   public:
@@ -4093,6 +4095,26 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
       return N0;
   }
 
+  // (sub x, ([v]select (ult x, y), 0, y)) -> (umin x, (sub x, y))
+  // (sub x, ([v]select (uge x, y), y, 0)) -> (umin x, (sub x, y))
+  if (N1.hasOneUse() && hasUMin(VT)) {
+    SDValue Y;
+    if (sd_match(N1, m_Select(m_SetCC(m_Specific(N0), m_Value(Y),
+                                      m_SpecificCondCode(ISD::SETULT)),
+                              m_Zero(), m_Deferred(Y))) ||
+        sd_match(N1, m_Select(m_SetCC(m_Specific(N0), m_Value(Y),
+                                      m_SpecificCondCode(ISD::SETUGE)),
+                              m_Deferred(Y), m_Zero())) ||
+        sd_match(N1, m_VSelect(m_SetCC(m_Specific(N0), m_Value(Y),
+                                       m_SpecificCondCode(ISD::SETULT)),
+                               m_Zero(), m_Deferred(Y))) ||
+        sd_match(N1, m_VSelect(m_SetCC(m_Specific(N0), m_Value(Y),
+                                       m_SpecificCondCode(ISD::SETUGE)),
+                               m_Deferred(Y), m_Zero())))
+      return DAG.getNode(ISD::UMIN, DL, VT, N0,
+                         DAG.getNode(ISD::SUB, DL, VT, N0, Y));
+  }
+
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
@@ -4442,20 +4464,6 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
       sd_match(N1, m_UMaxLike(m_Specific(A), m_Specific(B))))
     return DAG.getNegative(DAG.getNode(ISD::ABDU, DL, VT, A, B), DL, VT);
 
-  // (sub x, (select (ult x, y), 0, y)) -> (umin x, (sub x, y))
-  // (sub x, (select (uge x, y), y, 0)) -> (umin x, (sub x, y))
-  if (hasUMin(VT)) {
-    SDValue Y;
-    if (sd_match(N1, m_OneUse(m_Select(m_SetCC(m_Specific(N0), m_Value(Y),
-                                               m_SpecificCondCode(ISD::SETULT)),
-                                       m_Zero(), m_Deferred(Y)))) ||
-        sd_match(N1, m_OneUse(m_Select(m_SetCC(m_Specific(N0), m_Value(Y),
-                                               m_SpecificCondCode(ISD::SETUGE)),
-                                       m_Deferred(Y), m_Zero()))))
-      return DAG.getNode(ISD::UMIN, DL, VT, N0,
-                         DAG.getNode(ISD::SUB, DL, VT, N0, Y));
-  }
-
   return SDValue();
 }
 
@@ -12173,6 +12181,30 @@ SDValue DAGCombiner::foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True,
   return SDValue();
 }
 
+// ([v]select (ugt x, C), (add x, ~C), x) -> (umin (add x, ~C), x)
+// ([v]select (ult x, C), x, (add x, -C)) -> (umin x, (add x, -C))
+SDValue DAGCombiner::foldSelectToUMin(SDValue LHS, SDValue RHS, SDValue True,
+                                      SDValue False, ISD::CondCode CC,
+                                      const SDLoc &DL) {
+  APInt C;
+  EVT VT = True.getValueType();
+  if (sd_match(RHS, m_ConstInt(C)) && hasUMin(VT)) {
+    if (CC == ISD::SETUGT && LHS == False &&
+        sd_match(True, m_Add(m_Specific(False), m_SpecificInt(~C)))) {
+      SDValue AddC = DAG.getConstant(~C, DL, VT);
+      SDValue Add = DAG.getNode(ISD::ADD, DL, VT, False, AddC);
+      return DAG.getNode(ISD::UMIN, DL, VT, Add, False);
+    }
+    if (CC == ISD::SETULT && LHS == True &&
+        sd_match(False, m_Add(m_Specific(True), m_SpecificInt(-C)))) {
+      SDValue AddC = DAG.getConstant(-C, DL, VT);
+      SDValue Add = DAG.getNode(ISD::ADD, DL, VT, True, AddC);
+      return DAG.getNode(ISD::UMIN, DL, VT, True, Add);
+    }
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitSELECT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -12358,24 +12390,8 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
 
     // (select (ugt x, C), (add x, ~C), x) -> (umin (add x, ~C), x)
     // (select (ult x, C), x, (add x, -C)) -> (umin x, (add x, -C))
-    APInt C;
-    if (sd_match(Cond1, m_ConstInt(C)) && hasUMin(VT)) {
-      if (CC == ISD::SETUGT && Cond0 == N2 &&
-          sd_match(N1, m_Add(m_Specific(N2), m_SpecificInt(~C)))) {
-        // The resulting code relies on an unsigned wrap in ADD.
-        // Recreating ADD to drop possible nuw/nsw flags.
-        SDValue AddC = DAG.getConstant(~C, DL, VT);
-        SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N2, AddC);
-        return DAG.getNode(ISD::UMIN, DL, VT, Add, N2);
-      }
-      if (CC == ISD::SETULT && Cond0 == N1 &&
-          sd_match(N2, m_Add(m_Specific(N1), m_SpecificInt(-C)))) {
-        // Ditto.
-        SDValue AddC = DAG.getConstant(-C, DL, VT);
-        SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, AddC);
-        return DAG.getNode(ISD::UMIN, DL, VT, N1, Add);
-      }
-    }
+    if (SDValue UMin = foldSelectToUMin(Cond0, Cond1, N1, N2, CC, DL))
+      return UMin;
   }
 
   if (!VT.isVector())
@@ -13412,6 +13428,11 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
         }
       }
     }
+
+    // (vselect (ugt x, C), (add x, ~C), x) -> (umin (add x, ~C), x)
+    // (vselect (ult x, C), x, (add x, -C)) -> (umin x, (add x, -C))
+    if (SDValue UMin = foldSelectToUMin(LHS, RHS, N1, N2, CC, DL))
+      return UMin;
   }
 
   if (SimplifySelectOps(N, N1, N2))
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index 35b9457fbc1f..9df71cfc96cc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -5712,9 +5712,8 @@ define <8 x i8> @vsub_if_uge_v8i8(<8 x i8> %va, <8 x i8> %vb) {
 ; CHECK-LABEL: vsub_if_uge_v8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vsub.vv v9, v8, v9
-; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT:    vminu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %cmp = icmp ult <8 x i8> %va, %vb
   %select = select <8 x i1> %cmp, <8 x i8> zeroinitializer, <8 x i8> %vb
@@ -5725,9 +5724,9 @@ define <8 x i8> @vsub_if_uge_v8i8(<8 x i8> %va, <8 x i8> %vb) {
 define <8 x i8> @vsub_if_uge_swapped_v8i8(<8 x i8> %va, <8 x i8> %vb) {
 ; CHECK-LABEL: vsub_if_uge_swapped_v8i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v0, v9, v8
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vsub.vv v9, v8, v9
+; CHECK-NEXT:    vminu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %cmp = icmp uge <8 x i8> %va, %vb
   %select = select <8 x i1> %cmp, <8 x i8> %vb, <8 x i8> zeroinitializer
@@ -5739,9 +5738,8 @@ define <8 x i16> @vsub_if_uge_v8i16(<8 x i16> %va, <8 x i16> %vb) {
 ; CHECK-LABEL: vsub_if_uge_v8i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vsub.vv v9, v8, v9
-; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT:    vminu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %cmp = icmp ult <8 x i16> %va, %vb
   %select = select <8 x i1> %cmp, <8 x i16> zeroinitializer, <8 x i16> %vb
@@ -5752,9 +5750,9 @@ define <8 x i16> @vsub_if_uge_v8i16(<8 x i16> %va, <8 x i16> %vb) {
 define <8 x i16> @vsub_if_uge_swapped_v8i16(<8 x i16> %va, <8 x i16> %vb) {
 ; CHECK-LABEL: vsub_if_uge_swapped_v8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v0, v9, v8
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vsub.vv v9, v8, v9
+; CHECK-NEXT:    vminu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %cmp = icmp uge <8 x i16> %va, %vb
   %select = select <8 x i1> %cmp, <8 x i16> %vb, <8 x i16> zeroinitializer
@@ -5766,9 +5764,8 @@ define <4 x i32> @vsub_if_uge_v4i32(<4 x i32> %va, <4 x i32> %vb) {
 ; CHECK-LABEL: vsub_if_uge_v4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vsub.vv v9, v8, v9
-; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT:    vminu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %cmp = icmp ult <4 x i32> %va, %vb
   %select = select <4 x i1> %cmp, <4 x i32> zeroinitializer, <4 x i32> %vb
@@ -5779,9 +5776,9 @@ define <4 x i32> @vsub_if_uge_v4i32(<4 x i32> %va, <4 x i32> %vb) {
 define <4 x i32> @vsub_if_uge_swapped_v4i32(<4 x i32> %va, <4 x i32> %vb) {
 ; CHECK-LABEL: vsub_if_uge_swapped_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v0, v9, v8
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsub.vv v9, v8, v9
+; CHECK-NEXT:    vminu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %cmp = icmp uge <4 x i32> %va, %vb
   %select = select <4 x i1> %cmp, <4 x i32> %vb, <4 x i32> zeroinitializer
@@ -5793,9 +5790,8 @@ define <2 x i64> @vsub_if_uge_v2i64(<2 x i64> %va, <2 x i64> %vb) {
 ; CHECK-LABEL: vsub_if_uge_v2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vsub.vv v9, v8, v9
-; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT:    vminu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %cmp = icmp ult <2 x i64> %va, %vb
   %select = select <2 x i1> %cmp, <2 x i64> zeroinitializer, <2 x i64> %vb
@@ -5806,9 +5802,9 @@ define <2 x i64> @vsub_if_uge_v2i64(<2 x i64> %va, <2 x i64> %vb) {
 define <2 x i64> @vsub_if_uge_swapped_v2i64(<2 x i64> %va, <2 x i64> %vb) {
 ; CHECK-LABEL: vsub_if_uge_swapped_v2i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v0, v9, v8
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vsub.vv v9, v8, v9
+; CHECK-NEXT:    vminu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %cmp = icmp uge <2 x i64> %va, %vb
   %select = select <2 x i1> %cmp, <2 x i64> %vb, <2 x i64> zeroinitializer
@@ -5819,9 +5815,9 @@ define <2 x i64> @vsub_if_uge_swapped_v2i64(<2 x i64> %va, <2 x i64> %vb) {
 define <8 x i8> @sub_if_uge_C_v8i8(<8 x i8> %x) {
 ; CHECK-LABEL: sub_if_uge_C_v8i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vmsgtu.vi v0, v8, 12
-; CHECK-NEXT:    vadd.vi v8, v8, -13, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vadd.vi v9, v8, -13
+; CHECK-NEXT:    vminu.vv v8, v9, v8
 ; CHECK-NEXT:    ret
   %cmp = icmp ugt <8 x i8> %x, splat (i8 12)
   %sub = add <8 x i8> %x, splat (i8 -13)
@@ -5832,11 +5828,10 @@ define <8 x i8> @sub_if_uge_C_v8i8(<8 x i8> %x) {
 define <8 x i16> @sub_if_uge_C_v8i16(<8 x i16> %x) {
 ; CHECK-LABEL: sub_if_uge_C_v8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 2000
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; CHECK-NEXT:    vmsgtu.vx v0, v8, a0
 ; CHECK-NEXT:    li a0, -2001
-; CHECK-NEXT:    vadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vadd.vx v9, v8, a0
+; CHECK-NEXT:    vminu.vv v8, v9, v8
 ; CHECK-NEXT:    ret
   %cmp = icmp ugt <8 x i16> %x, splat (i16 2000)
   %sub = add <8 x i16> %x, splat (i16 -2001)
@@ -5847,13 +5842,11 @@ define <8 x i16> @sub_if_uge_C_v8i16(<8 x i16> %x) {
 define <4 x i32> @sub_if_uge_C_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: sub_if_uge_C_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, 16
-; CHECK-NEXT:    addi a0, a0, -16
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT:    vmsgtu.vx v0, v8, a0
 ; CHECK-NEXT:    lui a0, 1048560
 ; CHECK-NEXT:    addi a0, a0, 15
-; CHECK-NEXT:    vadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vadd.vx v9, v8, a0
+; CHECK-NEXT:    vminu.vv v8, v9, v8
 ; CHECK-NEXT:    ret
   %cmp = icmp ugt <4 x i32> %x, splat (i32 65520)
   %sub = add <4 x i32> %x, splat (i32 -65521)
@@ -5864,14 +5857,11 @@ define <4 x i32> @sub_if_uge_C_v4i32(<4 x i32> %x) {
 define <4 x i32> @sub_if_uge_C_swapped_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: sub_if_uge_C_swapped_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, 16
-; CHECK-NEXT:    addi a0, a0, -15
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmsltu.vx v0, v8, a0
 ; CHECK-NEXT:    lui a0, 1048560
 ; CHECK-NEXT:    addi a0, a0, 15
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vadd.vx v9, v8, a0
-; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT:    vminu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %cmp = icmp ult <4 x i32> %x, splat (i32 65521)
   %sub = add <4 x i32> %x, splat (i32 -65521)
@@ -5883,38 +5873,28 @@ define <2 x i64> @sub_if_uge_C_v2i64(<2 x i64> %x) nounwind {
 ; RV32-LABEL: sub_if_uge_C_v2i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    li a0, 1
-; RV32-NEXT:    lui a1, 172127
-; RV32-NEXT:    mv a2, sp
-; RV32-NEXT:    addi a1, a1, 512
-; RV32-NEXT:    sw a1, 0(sp)
-; RV32-NEXT:    sw a0, 4(sp)
 ; RV32-NEXT:    li a0, -2
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; RV32-NEXT:    vlse64.v v9, (a2), zero
 ; RV32-NEXT:    lui a1, 876449
 ; RV32-NEXT:    addi a1, a1, -513
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a0, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vmsltu.vv v0, v9, v8
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vadd.vv v9, v8, v9
+; RV32-NEXT:    vminu.vv v8, v9, v8
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: sub_if_uge_C_v2i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a0, 2384
-; RV64-NEXT:    addi a0, a0, 761
-; RV64-NEXT:    slli a0, a0, 9
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
-; RV64-NEXT:    vmsgtu.vx v0, v8, a0
 ; RV64-NEXT:    lui a0, 1048278
 ; RV64-NEXT:    addi a0, a0, -95
 ; RV64-NEXT:    slli a0, a0, 12
 ; RV64-NEXT:    addi a0, a0, -513
-; RV64-NEXT:    vadd.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vadd.vx v9, v8, a0
+; RV64-NEXT:    vminu.vv v8, v9, v8
 ; RV64-NEXT:    ret
   %cmp = icmp ugt <2 x i64> %x, splat (i64 5000000000)
   %sub = add <2 x i64> %x, splat (i64 -5000000001)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll
index a21a526e00ec..9b58cb3d5c89 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll
@@ -898,9 +898,8 @@ define <vscale x 2 x i8> @vsub_if_uge_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2
 ; CHECK-LABEL: vsub_if_uge_nxv2i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vsub.vv v9, v8, v9
-; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT:    vminu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %cmp = icmp ult <vscale x 2 x i8> %va, %vb
   %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> %vb
@@ -911,9 +910,9 @@ define <vscale x 2 x i8> @vsub_if_uge_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2
 define <vscale x 2 x i8> @vsub_if_uge_swapped_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb) {
 ; CHECK-LABEL: vsub_if_uge_swapped_nxv2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, mu
-; CHECK-NEXT:    vmsleu.vv v0, v9, v8
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vsub.vv v9, v8, v9
+; CHECK-NEXT:    vminu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %cmp = icmp uge <vscale x 2 x i8> %va, %vb
   %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i8> %vb, <vscale x 2 x i8> zeroinitializer
@@ -925,9 +924,8 @@ define <vscale x 2 x i16> @vsub_if_uge_nxv2i16(<vscale x 2 x i16> %va, <vscale x
 ; CHECK-LABEL: vsub_if_uge_nxv2i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vsub.vv v9, v8, v9
-; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT:    vminu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %cmp = icmp ult <vscale x 2 x i16> %va, %vb
   %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> %vb
@@ -938,9 +936,9 @@ define <vscale x 2 x i16> @vsub_if_uge_nxv2i16(<vscale x 2 x i16> %va, <vscale x
 define <vscale x 2 x i16> @vsub_if_uge_swapped_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb) {
 ; CHECK-LABEL: vsub_if_uge_swapped_nxv2i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v0, v9, v8
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsub.vv v9, v8, v9
+; CHECK-NEXT:    vminu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %cmp = icmp uge <vscale x 2 x i16> %va, %vb
   %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i16> %vb, <vscale x 2 x i16> zeroinitializer
@@ -952,9 +950,8 @@ define <vscale x 2 x i32> @vsub_if_uge_nxv2i32(<vscale x 2 x i32> %va, <vscale x
 ; CHECK-LABEL: vsub_if_uge_nxv2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmsltu.vv v0, v8, v9
 ; CHECK-NEXT:    vsub.vv v9, v8, v9
-; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT:    vminu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %cmp = icmp ult <vscale x 2 x i32> %va, %vb
   %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %vb
@@ -965,9 +962,9 @@ define <vscale x 2 x i32> @vsub_if_uge_nxv2i32(<vscale x 2 x i32> %va, <vscale x
 define <vscale x 2 x i32> @vsub_if_uge_swapped_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb) {
 ; CHECK-LABEL: vsub_if_uge_swapped_nxv2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
-; CHECK-NEXT:    vmsleu.vv v0, v9, v8
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsub.vv v9, v8, v9
+; CHECK-NEXT:    vminu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %cmp = icmp uge <vscale x 2 x i32> %va, %vb
   %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i32> %vb, <vscale x 2 x i32> zeroinitializer
@@ -979,9 +976,8 @@ define <vscale x 2 x i64> @vsub_if_uge_nxv2i64(<vscale x 2 x i64> %va, <vscale x
 ; CHECK-LABEL: vsub_if_uge_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vmsltu.vv v0, v8, v10
 ; CHECK-NEXT:    vsub.vv v10, v8, v10
-; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT:    vminu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %cmp = icmp ult <vscale x 2 x i64> %va, %vb
   %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> %vb
@@ -992,9 +988,9 @@ define <vscale x 2 x i64> @vsub_if_uge_nxv2i64(<vscale x 2 x i64> %va, <vscale x
 define <vscale x 2 x i64> @vsub_if_uge_swapped_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb) {
 ; CHECK-LABEL: vsub_if_uge_swapped_nxv2i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vmsleu.vv v0, v10, v8
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vsub.vv v10, v8, v10
+; CHECK-NEXT:    vminu.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %cmp = icmp uge <vscale x 2 x i64> %va, %vb
   %select = select <vscale x 2 x i1> %cmp, <vscale x 2 x i64> %vb, <vscale x 2 x i64> zeroinitializer
@@ -1005,9 +1001,9 @@ define <vscale x 2 x i64> @vsub_if_uge_swapped_nxv2i64(<vscale x 2 x i64> %va, <
 define <vscale x 2 x i8> @sub_if_uge_C_nxv2i8(<vscale x 2 x i8> %x) {
 ; CHECK-LABEL: sub_if_uge_C_nxv2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, mu
-; CHECK-NEXT:    vmsgtu.vi v0, v8, 12
-; CHECK-NEXT:    vadd.vi v8, v8, -13, v0.t
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vadd.vi v9, v8, -13
+; CHECK-NEXT:    vminu.vv v8, v9, v8
 ; CHECK-NEXT:    ret
   %cmp = icmp ugt <vscale x 2 x i8> %x, splat (i8 12)
   %sub = add <vscale x 2 x i8> %x, splat (i8 -13)
@@ -1018,11 +1014,10 @@ define <vscale x 2 x i8> @sub_if_uge_C_nxv2i8(<vscale x 2 x i8> %x) {
 define <vscale x 2 x i16> @sub_if_uge_C_nxv2i16(<vscale x 2 x i16> %x) {
 ; CHECK-LABEL: sub_if_uge_C_nxv2i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 2000
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, mu
-; CHECK-NEXT:    vmsgtu.vx v0, v8, a0
 ; CHECK-NEXT:    li a0, -2001
-; CHECK-NEXT:    vadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vadd.vx v9, v8, a0
+; CHECK-NEXT:    vminu.vv v8, v9, v8
 ; CHECK-NEXT:    ret
   %cmp = icmp ugt <vscale x 2 x i16> %x, splat (i16 2000)
   %sub = add <vscale x 2 x i16> %x, splat (i16 -2001)
@@ -1033,13 +1028,11 @@ define <vscale x 2 x i16> @sub_if_uge_C_nxv2i16(<vscale x 2 x i16> %x) {
 define <vscale x 2 x i32> @sub_if_uge_C_nxv2i32(<vscale x 2 x i32> %x) {
 ; CHECK-LABEL: sub_if_uge_C_nxv2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, 16
-; CHECK-NEXT:    addi a0, a0, -16
-; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, mu
-; CHECK-NEXT:    vmsgtu.vx v0, v8, a0
 ; CHECK-NEXT:    lui a0, 1048560
 ; CHECK-NEXT:    addi a0, a0, 15
-; CHECK-NEXT:    vadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vadd.vx v9, v8, a0
+; CHECK-NEXT:    vminu.vv v8, v9, v8
 ; CHECK-NEXT:    ret
   %cmp = icmp ugt <vscale x 2 x i32> %x, splat (i32 65520)
   %sub = add <vscale x 2 x i32> %x, splat (i32 -65521)
@@ -1050,14 +1043,11 @@ define <vscale x 2 x i32> @sub_if_uge_C_nxv2i32(<vscale x 2 x i32> %x) {
 define <vscale x 2 x i32> @sub_if_uge_C_swapped_nxv2i32(<vscale x 2 x i32> %x) {
 ; CHECK-LABEL: sub_if_uge_C_swapped_nxv2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, 16
-; CHECK-NEXT:    addi a0, a0, -15
-; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmsltu.vx v0, v8, a0
 ; CHECK-NEXT:    lui a0, 1048560
 ; CHECK-NEXT:    addi a0, a0, 15
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vadd.vx v9, v8, a0
-; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT:    vminu.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %cmp = icmp ult <vscale x 2 x i32> %x, splat (i32 65521)
   %sub = add <vscale x 2 x i32> %x, splat (i32 -65521)
@@ -1069,38 +1059,28 @@ define <vscale x 2 x i64> @sub_if_uge_C_nxv2i64(<vscale x 2 x i64> %x) nounwind
 ; RV32-LABEL: sub_if_uge_C_nxv2i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    li a0, 1
-; RV32-NEXT:    lui a1, 172127
-; RV32-NEXT:    mv a2, sp
-; RV32-NEXT:    addi a1, a1, 512
-; RV32-NEXT:    sw a1, 0(sp)
-; RV32-NEXT:    sw a0, 4(sp)
 ; RV32-NEXT:    li a0, -2
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, mu
-; RV32-NEXT:    vlse64.v v10, (a2), zero
 ; RV32-NEXT:    lui a1, 876449
 ; RV32-NEXT:    addi a1, a1, -513
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a0, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vmsltu.vv v0, v10, v8
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    vadd.vv v10, v8, v10
+; RV32-NEXT:    vminu.vv v8, v10, v8
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: sub_if_uge_C_nxv2i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lui a0, 2384
-; RV64-NEXT:    addi a0, a0, 761
-; RV64-NEXT:    slli a0, a0, 9
-; RV64-NEXT:    vsetvli a1, zero, e64, m2, ta, mu
-; RV64-NEXT:    vmsgtu.vx v0, v8, a0
 ; RV64-NEXT:    lui a0, 1048278
 ; RV64-NEXT:    addi a0, a0, -95
 ; RV64-NEXT:    slli a0, a0, 12
 ; RV64-NEXT:    addi a0, a0, -513
-; RV64-NEXT:    vadd.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV64-NEXT:    vadd.vx v10, v8, a0
+; RV64-NEXT:    vminu.vv v8, v10, v8
 ; RV64-NEXT:    ret
   %cmp = icmp ugt <vscale x 2 x i64> %x, splat (i64 5000000000)
   %sub = add <vscale x 2 x i64> %x, splat (i64 -5000000001)
-- 
cgit v1.2.3


From 8f18dde6c0b38a67ad0f06aab79cdadb78b35d33 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Thu, 17 Jul 2025 07:32:59 -0700
Subject: [RISCV][IA] Rearrange code for readability and ease of merge [nfc]

---
 llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp | 66 ++++++++++++------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 8fb6ccaac2c9..0d4f24172b57 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -69,6 +69,39 @@ static const Intrinsic::ID ScalableVlsegIntrIds[] = {
     Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask,
     Intrinsic::riscv_vlseg8_mask};
 
+static const Intrinsic::ID FixedVssegIntrIds[] = {
+    Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask,
+    Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask,
+    Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask,
+    Intrinsic::riscv_seg8_store_mask};
+
+static const Intrinsic::ID ScalableVssegIntrIds[] = {
+    Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
+    Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
+    Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
+    Intrinsic::riscv_vsseg8_mask};
+
+static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
+  assert(N);
+  if (N == 1)
+    return true;
+
+  using namespace PatternMatch;
+  // Right now we're only recognizing the simplest pattern.
+  uint64_t C;
+  if (match(V, m_CombineOr(m_ConstantInt(C),
+                           m_NUWMul(m_Value(), m_ConstantInt(C)))) &&
+      C && C % N == 0)
+    return true;
+
+  if (isPowerOf2_32(N)) {
+    KnownBits KB = llvm::computeKnownBits(V, DL);
+    return KB.countMinTrailingZeros() >= Log2_32(N);
+  }
+
+  return false;
+}
+
 /// Lower an interleaved load into a vlsegN intrinsic.
 ///
 /// E.g. Lower an interleaved load (Factor = 2):
@@ -134,18 +167,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
   return true;
 }
 
-static const Intrinsic::ID FixedVssegIntrIds[] = {
-    Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask,
-    Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask,
-    Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask,
-    Intrinsic::riscv_seg8_store_mask};
-
-static const Intrinsic::ID ScalableVssegIntrIds[] = {
-    Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
-    Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
-    Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
-    Intrinsic::riscv_vsseg8_mask};
-
 /// Lower an interleaved store into a vssegN intrinsic.
 ///
 /// E.g. Lower an interleaved store (Factor = 3):
@@ -235,27 +256,6 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
   return true;
 }
 
-static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
-  assert(N);
-  if (N == 1)
-    return true;
-
-  using namespace PatternMatch;
-  // Right now we're only recognizing the simplest pattern.
-  uint64_t C;
-  if (match(V, m_CombineOr(m_ConstantInt(C),
-                           m_NUWMul(m_Value(), m_ConstantInt(C)))) &&
-      C && C % N == 0)
-    return true;
-
-  if (isPowerOf2_32(N)) {
-    KnownBits KB = llvm::computeKnownBits(V, DL);
-    return KB.countMinTrailingZeros() >= Log2_32(N);
-  }
-
-  return false;
-}
-
 bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
     Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
   const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
-- 
cgit v1.2.3


From 46357438baefbdcf630abc5d74565afcbf1c48dd Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 17 Jul 2025 16:47:54 +0200
Subject: [SCEV] Try to re-use existing LCSSA phis when expanding
 SCEVAddRecExpr. (#147214)

If an AddRec is expanded outside a loop with a single exit block, check
if any of the (lcssa) phi nodes in the exit block match the AddRec. If
that's the case, simply use the existing lcssa phi.

This can reduce the number of instructions created for SCEV expansions,
mainly for runtime checks generated by the loop vectorizer.

Compile-time impact should be mostly neutral:

https://llvm-compile-time-tracker.com/compare.php?from=48c7a3187f9831304a38df9bdb3b4d5bf6b6b1a2&to=cf9d039a7b0db5d0d912e0e2c01b19c2a653273a&stat=instructions:u

PR: https://github.com/llvm/llvm-project/pull/147214
---
 .../Transforms/Utils/ScalarEvolutionExpander.h     |  1 +
 .../Transforms/Utils/ScalarEvolutionExpander.cpp   | 23 ++++++++++++++++++++++
 .../reuse-lcssa-phi-scev-expansion.ll              |  4 +---
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
index a101151eed7c..39fef921a959 100644
--- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
+++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
@@ -530,6 +530,7 @@ private:
 
   bool isExpandedAddRecExprPHI(PHINode *PN, Instruction *IncV, const Loop *L);
 
+  Value *tryToReuseLCSSAPhi(const SCEVAddRecExpr *S);
   Value *expandAddRecExprLiterally(const SCEVAddRecExpr *);
   PHINode *getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
                                      const Loop *L, Type *&TruncTy,
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 739ac00ba47c..ed08c0bfa2e7 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -1223,6 +1223,24 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
   return Result;
 }
 
+Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) {
+  const Loop *L = S->getLoop();
+  BasicBlock *EB = L->getExitBlock();
+  if (!EB || !EB->getSinglePredecessor() ||
+      !SE.DT.dominates(EB, Builder.GetInsertBlock()))
+    return nullptr;
+
+  for (auto &PN : EB->phis()) {
+    if (!SE.isSCEVable(PN.getType()) || PN.getType() != S->getType())
+      continue;
+    auto *ExitV = SE.getSCEV(&PN);
+    if (S == ExitV)
+      return &PN;
+  }
+
+  return nullptr;
+}
+
 Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
   // In canonical mode we compute the addrec as an expression of a canonical IV
   // using evaluateAtIteration and expand the resulting SCEV expression. This
@@ -1262,6 +1280,11 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
     return V;
   }
 
+  // If S is expanded outside the defining loop, check if there is a
+  // matching LCSSA phi node for it.
+  if (Value *V = tryToReuseLCSSAPhi(S))
+    return V;
+
   // {X,+,F} --> X + {0,+,F}
   if (!S->getStart()->isZero()) {
     if (isa<PointerType>(S->getType())) {
diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
index 2747895f06a7..ce4270dc4b7f 100644
--- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
+++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
@@ -18,11 +18,9 @@ define void @reuse_lcssa_phi_for_add_rec1(ptr %head) {
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
 ; CHECK-NEXT:    br i1 [[EC_1]], label %[[PH:.*]], label %[[LOOP_1]]
 ; CHECK:       [[PH]]:
-; CHECK-NEXT:    [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], %[[LOOP_1]] ]
 ; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP_1]] ]
-; CHECK-NEXT:    [[IV_2_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_2_NEXT]], %[[LOOP_1]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[IV_2_NEXT]], %[[LOOP_1]] ]
 ; CHECK-NEXT:    [[SRC_2:%.*]] = tail call noalias noundef dereferenceable_or_null(8) ptr @calloc(i64 1, i64 8)
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[IV_2_LCSSA]], 1
 ; CHECK-NEXT:    [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP0]], i32 1)
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
-- 
cgit v1.2.3


From 7817163663b3bb662a46a73cf1903ec900ba6146 Mon Sep 17 00:00:00 2001
From: Jeremy Kun <jkun@google.com>
Date: Thu, 17 Jul 2025 08:01:58 -0700
Subject: [mlir] [presburger] Add IntegerRelation::rangeProduct (#148092)

This is intended to match `isl::map`'s `flat_range_product`.

---------

Co-authored-by: Jeremy Kun <j2kun@users.noreply.github.com>
---
 .../mlir/Analysis/Presburger/IntegerRelation.h     | 13 +++
 mlir/lib/Analysis/Presburger/IntegerRelation.cpp   | 38 +++++++++
 .../Analysis/Presburger/IntegerRelationTest.cpp    | 94 ++++++++++++++++++++++
 3 files changed, 145 insertions(+)

diff --git a/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h b/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h
index b68262f09f48..ee401cca8f55 100644
--- a/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h
+++ b/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h
@@ -707,6 +707,19 @@ public:
   /// this for uniformity with `applyDomain`.
   void applyRange(const IntegerRelation &rel);
 
+  /// Let the relation `this` be R1, and the relation `rel` be R2. Requires
+  /// R1 and R2 to have the same domain.
+  ///
+  /// Let R3 be the rangeProduct of R1 and R2. Then x R3 (y, z) iff
+  /// (x R1 y and x R2 z).
+  ///
+  /// Example:
+  ///
+  /// R1: (i, j) -> k : f(i, j, k) = 0
+  /// R2: (i, j) -> l : g(i, j, l) = 0
+  /// R1.rangeProduct(R2): (i, j) -> (k, l) : f(i, j, k) = 0 and g(i, j, l) = 0
+  IntegerRelation rangeProduct(const IntegerRelation &rel);
+
   /// Given a relation `other: (A -> B)`, this operation merges the symbol and
   /// local variables and then takes the composition of `other` on `this: (B ->
   /// C)`. The resulting relation represents tuples of the form: `A -> C`.
diff --git a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp
index 17e48e0d069b..5c4d4d13580a 100644
--- a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp
+++ b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp
@@ -2481,6 +2481,44 @@ void IntegerRelation::applyDomain(const IntegerRelation &rel) {
 
 void IntegerRelation::applyRange(const IntegerRelation &rel) { compose(rel); }
 
+IntegerRelation IntegerRelation::rangeProduct(const IntegerRelation &rel) {
+  /// R1: (i, j) -> k : f(i, j, k) = 0
+  /// R2: (i, j) -> l : g(i, j, l) = 0
+  /// R1.rangeProduct(R2): (i, j) -> (k, l) : f(i, j, k) = 0 and g(i, j, l) = 0
+  assert(getNumDomainVars() == rel.getNumDomainVars() &&
+         "Range product is only defined for relations with equal domains");
+
+  // explicit copy of `this`
+  IntegerRelation result = *this;
+  unsigned relRangeVarStart = rel.getVarKindOffset(VarKind::Range);
+  unsigned numThisRangeVars = getNumRangeVars();
+  unsigned numNewSymbolVars = result.getNumSymbolVars() - getNumSymbolVars();
+
+  result.appendVar(VarKind::Range, rel.getNumRangeVars());
+
+  // Copy each equality from `rel` and update the copy to account for range
+  // variables from `this`. The `rel` equality is a list of coefficients of the
+  // variables from `rel`, and so the range variables need to be shifted right
+  // by the number of `this` range variables and symbols.
+  for (unsigned i = 0; i < rel.getNumEqualities(); ++i) {
+    SmallVector<DynamicAPInt> copy =
+        SmallVector<DynamicAPInt>(rel.getEquality(i));
+    copy.insert(copy.begin() + relRangeVarStart,
+                numThisRangeVars + numNewSymbolVars, DynamicAPInt(0));
+    result.addEquality(copy);
+  }
+
+  for (unsigned i = 0; i < rel.getNumInequalities(); ++i) {
+    SmallVector<DynamicAPInt> copy =
+        SmallVector<DynamicAPInt>(rel.getInequality(i));
+    copy.insert(copy.begin() + relRangeVarStart,
+                numThisRangeVars + numNewSymbolVars, DynamicAPInt(0));
+    result.addInequality(copy);
+  }
+
+  return result;
+}
+
 void IntegerRelation::printSpace(raw_ostream &os) const {
   space.print(os);
   os << getNumConstraints() << " constraints\n";
diff --git a/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp b/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp
index 7df500bc9568..dd0b09f7f05d 100644
--- a/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp
+++ b/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp
@@ -608,3 +608,97 @@ TEST(IntegerRelationTest, convertVarKindToLocal) {
   EXPECT_EQ(space.getId(VarKind::Symbol, 0), Identifier(&identifiers[3]));
   EXPECT_EQ(space.getId(VarKind::Symbol, 1), Identifier(&identifiers[4]));
 }
+
+TEST(IntegerRelationTest, rangeProduct) {
+  IntegerRelation r1 = parseRelationFromSet(
+      "(i, j, k) : (2*i + 3*k == 0, i >= 0, j >= 0, k >= 0)", 2);
+  IntegerRelation r2 = parseRelationFromSet(
+      "(i, j, l) : (4*i + 6*j + 9*l == 0, i >= 0, j >= 0, l >= 0)", 2);
+
+  IntegerRelation rangeProd = r1.rangeProduct(r2);
+  IntegerRelation expected =
+      parseRelationFromSet("(i, j, k, l) : (2*i + 3*k == 0, 4*i + 6*j + 9*l == "
+                           "0, i >= 0, j >= 0, k >= 0, l >= 0)",
+                           2);
+
+  EXPECT_TRUE(expected.isEqual(rangeProd));
+}
+
+TEST(IntegerRelationTest, rangeProductMultdimRange) {
+  IntegerRelation r1 =
+      parseRelationFromSet("(i, k) : (2*i + 3*k == 0, i >= 0, k >= 0)", 1);
+  IntegerRelation r2 = parseRelationFromSet(
+      "(i, l, m) : (4*i + 6*m + 9*l == 0, i >= 0, l >= 0, m >= 0)", 1);
+
+  IntegerRelation rangeProd = r1.rangeProduct(r2);
+  IntegerRelation expected =
+      parseRelationFromSet("(i, k, l, m) : (2*i + 3*k == 0, 4*i + 6*m + 9*l == "
+                           "0, i >= 0, k >= 0, l >= 0, m >= 0)",
+                           1);
+
+  EXPECT_TRUE(expected.isEqual(rangeProd));
+}
+
+TEST(IntegerRelationTest, rangeProductMultdimRangeSwapped) {
+  IntegerRelation r1 = parseRelationFromSet(
+      "(i, l, m) : (4*i + 6*m + 9*l == 0, i >= 0, l >= 0, m >= 0)", 1);
+  IntegerRelation r2 =
+      parseRelationFromSet("(i, k) : (2*i + 3*k == 0, i >= 0, k >= 0)", 1);
+
+  IntegerRelation rangeProd = r1.rangeProduct(r2);
+  IntegerRelation expected =
+      parseRelationFromSet("(i, l, m, k) : (2*i + 3*k == 0, 4*i + 6*m + 9*l == "
+                           "0, i >= 0, k >= 0, l >= 0, m >= 0)",
+                           1);
+
+  EXPECT_TRUE(expected.isEqual(rangeProd));
+}
+
+TEST(IntegerRelationTest, rangeProductEmptyDomain) {
+  IntegerRelation r1 =
+      parseRelationFromSet("(i, j) : (4*i + 9*j == 0, i >= 0, j >= 0)", 0);
+  IntegerRelation r2 =
+      parseRelationFromSet("(k, l) : (2*k + 3*l == 0, k >= 0, l >= 0)", 0);
+  IntegerRelation rangeProd = r1.rangeProduct(r2);
+  IntegerRelation expected =
+      parseRelationFromSet("(i, j, k, l) : (2*k + 3*l == 0, 4*i + 9*j == "
+                           "0, i >= 0, j >= 0, k >= 0, l >= 0)",
+                           0);
+  EXPECT_TRUE(expected.isEqual(rangeProd));
+}
+
+TEST(IntegerRelationTest, rangeProductEmptyRange) {
+  IntegerRelation r1 =
+      parseRelationFromSet("(i, j) : (4*i + 9*j == 0, i >= 0, j >= 0)", 2);
+  IntegerRelation r2 =
+      parseRelationFromSet("(i, j) : (2*i + 3*j == 0, i >= 0, j >= 0)", 2);
+  IntegerRelation rangeProd = r1.rangeProduct(r2);
+  IntegerRelation expected =
+      parseRelationFromSet("(i, j) : (2*i + 3*j == 0, 4*i + 9*j == "
+                           "0, i >= 0, j >= 0)",
+                           2);
+  EXPECT_TRUE(expected.isEqual(rangeProd));
+}
+
+TEST(IntegerRelationTest, rangeProductEmptyDomainAndRange) {
+  IntegerRelation r1 = parseRelationFromSet("() : ()", 0);
+  IntegerRelation r2 = parseRelationFromSet("() : ()", 0);
+  IntegerRelation rangeProd = r1.rangeProduct(r2);
+  IntegerRelation expected = parseRelationFromSet("() : ()", 0);
+  EXPECT_TRUE(expected.isEqual(rangeProd));
+}
+
+TEST(IntegerRelationTest, rangeProductSymbols) {
+  IntegerRelation r1 = parseRelationFromSet(
+      "(i, j)[s] : (2*i + 3*j + s == 0, i >= 0, j >= 0)", 1);
+  IntegerRelation r2 = parseRelationFromSet(
+      "(i, l)[s] : (3*i + 4*l + s == 0, i >= 0, l >= 0)", 1);
+
+  IntegerRelation rangeProd = r1.rangeProduct(r2);
+  IntegerRelation expected = parseRelationFromSet(
+      "(i, j, l)[s] : (2*i + 3*j + s == 0, 3*i + 4*l + s == "
+      "0, i >= 0, j >= 0, l >= 0)",
+      1);
+
+  EXPECT_TRUE(expected.isEqual(rangeProd));
+}
-- 
cgit v1.2.3


From 4bf82aebc0da985cf6b2e70812714875e8fa78fa Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Thu, 17 Jul 2025 16:06:03 +0100
Subject: [lldb][test] Fix TestChildCountTruncation on Windows (#149322)

Fix the test by not forcing the DWARF debug info format. When the format
is left as the default, the tests pass.

Test added by https://github.com/llvm/llvm-project/pull/149088.
---
 lldb/test/Shell/Settings/TestChildCountTruncation.test | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/lldb/test/Shell/Settings/TestChildCountTruncation.test b/lldb/test/Shell/Settings/TestChildCountTruncation.test
index 2660ccae1aa5..da6436cb5ca2 100644
--- a/lldb/test/Shell/Settings/TestChildCountTruncation.test
+++ b/lldb/test/Shell/Settings/TestChildCountTruncation.test
@@ -1,11 +1,8 @@
 # Test that we warn the user about truncated output
 # when target.max-children-count wasn't explicitly set.
 
-# link.exe discards the DWARF information needed.
-# UNSUPPORTED: system-windows
-
 # RUN: split-file %s %t
-# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out
+# RUN: %clang_host -g %t/main.cpp -o %t.out
 # RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \
 # RUN:       | FileCheck %s --check-prefix=DWIM
 #
-- 
cgit v1.2.3


From 149aa7679457e4c434374076fa3ad6d02efbe414 Mon Sep 17 00:00:00 2001
From: Baranov Victor <bar.victor.2002@gmail.com>
Date: Thu, 17 Jul 2025 18:38:05 +0300
Subject: [clang-tools-extra][NFC] Fix link to code review in README.txt
 (#148384)

---
 clang-tools-extra/README.txt | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/clang-tools-extra/README.txt b/clang-tools-extra/README.txt
index 6891e4078997..1195db9b468d 100644
--- a/clang-tools-extra/README.txt
+++ b/clang-tools-extra/README.txt
@@ -8,12 +8,13 @@ Clang frontend.  These tools are kept in a separate "extra" repository to
 allow lighter weight checkouts of the core Clang codebase.
 
 All discussion regarding Clang, Clang-based tools, and code in this repository
-should be held using the standard Clang forum:
+should be held using the standard Clang forums:
   https://discourse.llvm.org/c/clang
+  https://discourse.llvm.org/c/clang/clang-tidy/71
+  https://discourse.llvm.org/c/clang/clangd/34
 
-Code review for this tree should take place on the standard Clang patch and
-commit lists:
-  http://lists.llvm.org/mailman/listinfo/cfe-commits
+Code review for this tree should take place on Github:
+  https://github.com/llvm/llvm-project/pulls?q=label%3Aclang-tools-extra
 
 If you find a bug in these tools, please file it in the LLVM bug tracker:
   https://github.com/llvm/llvm-project/issues/
-- 
cgit v1.2.3


From 84d65e9d19ab577027238d38d053e293ba656e32 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Thu, 17 Jul 2025 18:00:32 +0200
Subject: [CIR] Upstream builtin_conj for ComplexType (#149170)

This change adds support for __builtin_conj for ComplexType.

https://github.com/llvm/llvm-project/issues/141365
---
 clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp     | 14 ++++++++++-
 clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp |  3 +--
 clang/lib/CIR/CodeGen/CIRGenValue.h         |  6 +++++
 clang/test/CIR/CodeGen/complex-builtins.cpp | 36 +++++++++++++++++++++++++++++
 4 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 476f99495928..61d1c54ee9ec 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -125,7 +125,7 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
     mlir::Value real = emitScalarExpr(e->getArg(0));
     mlir::Value imag = emitScalarExpr(e->getArg(1));
     mlir::Value complex = builder.createComplexCreate(loc, real, imag);
-    return RValue::get(complex);
+    return RValue::getComplex(complex);
   }
 
   case Builtin::BI__builtin_creal:
@@ -150,6 +150,18 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
     return RValue::get(imag);
   }
 
+  case Builtin::BI__builtin_conj:
+  case Builtin::BI__builtin_conjf:
+  case Builtin::BI__builtin_conjl:
+  case Builtin::BIconj:
+  case Builtin::BIconjf:
+  case Builtin::BIconjl: {
+    mlir::Value complex = emitComplexExpr(e->getArg(0));
+    mlir::Value conj = builder.createUnaryOp(getLoc(e->getExprLoc()),
+                                             cir::UnaryOpKind::Not, complex);
+    return RValue::getComplex(conj);
+  }
+
   case Builtin::BI__builtin_clrsb:
   case Builtin::BI__builtin_clrsbl:
   case Builtin::BI__builtin_clrsbll:
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
index 6663f5ea1e75..9f36be5397ad 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
@@ -231,8 +231,7 @@ mlir::Value ComplexExprEmitter::VisitBinComma(const BinaryOperator *e) {
 mlir::Value ComplexExprEmitter::VisitCallExpr(const CallExpr *e) {
   if (e->getCallReturnType(cgf.getContext())->isReferenceType())
     return emitLoadOfLValue(e);
-
-  return cgf.emitCallExpr(e).getValue();
+  return cgf.emitCallExpr(e).getComplexValue();
 }
 
 mlir::Value ComplexExprEmitter::VisitCastExpr(CastExpr *e) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h
index 0a6dba5e80a6..0832c4141a10 100644
--- a/clang/lib/CIR/CodeGen/CIRGenValue.h
+++ b/clang/lib/CIR/CodeGen/CIRGenValue.h
@@ -58,6 +58,12 @@ public:
     return value;
   }
 
+  /// Return the value of this complex value.
+  mlir::Value getComplexValue() const {
+    assert(isComplex() && "Not a complex!");
+    return value;
+  }
+
   /// Return the value of the address of the aggregate.
   Address getAggregateAddress() const {
     assert(isAggregate() && "Not an aggregate!");
diff --git a/clang/test/CIR/CodeGen/complex-builtins.cpp b/clang/test/CIR/CodeGen/complex-builtins.cpp
index f0d12d0ef666..811af47a704f 100644
--- a/clang/test/CIR/CodeGen/complex-builtins.cpp
+++ b/clang/test/CIR/CodeGen/complex-builtins.cpp
@@ -83,3 +83,39 @@ void foo3() {
 // OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[COMPLEX]], i32 0, i32 1
 // OGCG: %[[A_IMAG:.*]] = load double, ptr %[[A_IMAG_PTR]], align 8
 // OGCG: store double %[[A_IMAG]], ptr %[[INIT]], align 8
+
+void foo4() {
+  float _Complex a;
+  float _Complex b = __builtin_conjf(a);
+}
+
+// CIR: %[[COMPLEX:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"]
+// CIR: %[[RESULT:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["b", init]
+// CIR: %[[TMP:.*]] = cir.load{{.*}} %[[COMPLEX]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
+// CIR: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex<!cir.float> -> !cir.float
+// CIR: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex<!cir.float> -> !cir.float
+// CIR: %[[IMAG_MINUS:.*]] = cir.unary(minus, %[[IMAG]]) : !cir.float, !cir.float
+// CIR: %[[RESULT_VAL:.*]] = cir.complex.create %[[REAL]], %[[IMAG_MINUS]] : !cir.float -> !cir.complex<!cir.float>
+// CIR: cir.store{{.*}} %[[RESULT_VAL]], %[[RESULT]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
+
+// LLVM: %[[COMPLEX:.*]] = alloca { float, float }, i64 1, align 4
+// LLVM: %[[RESULT:.*]] = alloca { float, float }, i64 1, align 4
+// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[COMPLEX]], align 4
+// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0
+// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1
+// LLVM: %[[IMAG_MINUS:.*]] = fneg float %[[IMAG]]
+// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL]], 0
+// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG_MINUS]], 1
+// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[RESULT]], align 4
+
+// OGCG: %[[COMPLEX:.*]] = alloca { float, float }, align 4
+// OGCG: %[[RESULT:.*]] = alloca { float, float }, align 4
+// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4
+// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4
+// OGCG: %[[A_IMAG_MINUS:.*]] = fneg float  %[[A_IMAG]]
+// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT]], i32 0, i32 0
+// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT]], i32 0, i32 1
+// OGCG: store float %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4
+// OGCG: store float %[[A_IMAG_MINUS]], ptr %[[RESULT_IMAG_PTR]], align 4
-- 
cgit v1.2.3


From a7f595efd840f7ed2210f2703048fad4d0027fac Mon Sep 17 00:00:00 2001
From: nvptm <pmathew@nvidia.com>
Date: Thu, 17 Jul 2025 09:04:44 -0700
Subject: [flang][acc] Create UseDeviceOp for both results of hlfir.declare
 (#148017)

A sample such as
```
program test
  integer :: N = 100
  real*8 :: b(-1:N)
  !$acc data copy(b)
  !$acc host_data use_device(b)
  call vadd(b)
  !$acc end host_data
  !$acc end data
end

```
is lowered to
```
    %13:2 = hlfir.declare %11(%12) {uniq_name = "_QFEb"} : (!fir.ref<!fir.array<?xf64>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.array<?xf64>>)
    %14 = acc.copyin var(%13#0 : !fir.box<!fir.array<?xf64>>) -> !fir.box<!fir.array<?xf64>> {dataClause = #acc<data_clause acc_copy>, name = "b"}
    acc.data dataOperands(%14 : !fir.box<!fir.array<?xf64>>) {
      %15 = acc.use_device var(%13#0 : !fir.box<!fir.array<?xf64>>) -> !fir.box<!fir.array<?xf64>> {name = "b"}
      acc.host_data dataOperands(%15 : !fir.box<!fir.array<?xf64>>) {
        fir.call @_QPvadd(%13#1) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>) -> ()
        acc.terminator
      }
      acc.terminator
    }
    acc.copyout accVar(%14 : !fir.box<!fir.array<?xf64>>) to var(%13#0 : !fir.box<!fir.array<?xf64>>) {dataClause = #acc<data_clause acc_copy>, name = "b"}
```
Note that while the use_device clause is applied to %13#0, the argument
passed to vadd is %13#1. To avoid problems later in lowering, this
change additionally applies the use_device clause to %13#1, so that the
resulting MLIR is
```
   %13:2 = hlfir.declare %11(%12) {uniq_name = "_QFEb"} : (!fir.ref<!fir.array<?xf64>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.array<?xf64>>)
    %14 = acc.copyin var(%13#0 : !fir.box<!fir.array<?xf64>>) -> !fir.box<!fir.array<?xf64>> {dataClause = #acc<data_clause acc_copy>, name = "b"}
    acc.data dataOperands(%14 : !fir.box<!fir.array<?xf64>>) {
      %15 = acc.use_device var(%13#0 : !fir.box<!fir.array<?xf64>>) -> !fir.box<!fir.array<?xf64>> {name = "b"}
      %16 = acc.use_device varPtr(%13#1 : !fir.ref<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>> {name = "b"}
      acc.host_data dataOperands(%15, %16 : !fir.box<!fir.array<?xf64>>, !fir.ref<!fir.array<?xf64>>) {
        fir.call @_QPvadd(%13#1) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>) -> ()
        acc.terminator
      }
      acc.terminator
    }
    acc.copyout accVar(%14 : !fir.box<!fir.array<?xf64>>) to var(%13#0 : !fir.box<!fir.array<?xf64>>) {dataClause = #acc<data_clause acc_copy>, name = "b"}

```
---
 flang/lib/Lower/OpenACC.cpp                        | 20 ++++++-
 .../OpenACC/acc-host-data-unwrap-defaultbounds.f90 | 14 ++---
 flang/test/Lower/OpenACC/acc-host-data.f90         | 21 ++++----
 flang/test/Lower/OpenACC/acc-use-device.f90        | 61 ++++++++++++++++++++++
 4 files changed, 100 insertions(+), 16 deletions(-)
 create mode 100644 flang/test/Lower/OpenACC/acc-use-device.f90

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 39e4444cde4e..25682cba5620 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -708,6 +708,7 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList,
                          bool setDeclareAttr = false) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   Fortran::evaluate::ExpressionAnalyzer ea{semanticsContext};
+  const bool unwrapBoxAddr = true;
   for (const auto &accObject : objectList.v) {
     llvm::SmallVector<mlir::Value> bounds;
     std::stringstream asFortran;
@@ -735,8 +736,25 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList,
     Op op = createDataEntryOp<Op>(
         builder, operandLocation, baseAddr, asFortran, bounds, structured,
         implicit, dataClause, baseAddr.getType(), async, asyncDeviceTypes,
-        asyncOnlyDeviceTypes, /*unwrapBoxAddr=*/true, info.isPresent);
+        asyncOnlyDeviceTypes, unwrapBoxAddr, info.isPresent);
     dataOperands.push_back(op.getAccVar());
+
+    // For UseDeviceOp, if operand is one of a pair resulting from a
+    // declare operation, create a UseDeviceOp for the other operand as well.
+    if constexpr (std::is_same_v<Op, mlir::acc::UseDeviceOp>) {
+      if (auto declareOp =
+              mlir::dyn_cast<hlfir::DeclareOp>(baseAddr.getDefiningOp())) {
+        mlir::Value otherAddr = declareOp.getResult(1);
+        if (baseAddr != otherAddr) {
+          Op op = createDataEntryOp<Op>(builder, operandLocation, otherAddr,
+                                        asFortran, bounds, structured, implicit,
+                                        dataClause, otherAddr.getType(), async,
+                                        asyncDeviceTypes, asyncOnlyDeviceTypes,
+                                        unwrapBoxAddr, info.isPresent);
+          dataOperands.push_back(op.getAccVar());
+        }
+      }
+    }
   }
 }
 
diff --git a/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90 b/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90
index 164eb32a8f68..2de7cc5761a2 100644
--- a/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90
+++ b/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90
@@ -15,15 +15,17 @@ subroutine acc_host_data()
   !$acc end host_data
 
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
-! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
-! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref<!fir.array<10xf32>>)
+! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
+! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
+ ! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
 
   !$acc host_data use_device(a) if_present
   !$acc end host_data
 
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
-! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
-! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref<!fir.array<10xf32>>) {
+! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
+! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
+! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref<!fir.array<10xf32>>{{.*}}) {
 ! CHECK: } attributes {ifPresent}
 
   !$acc host_data use_device(a) if(ifCondition)
@@ -33,14 +35,14 @@ subroutine acc_host_data()
 ! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
 ! CHECK: %[[LOAD_IFCOND:.*]] = fir.load %[[DECLIFCOND]]#0 : !fir.ref<!fir.logical<4>>
 ! CHECK: %[[IFCOND_I1:.*]] = fir.convert %[[LOAD_IFCOND]] : (!fir.logical<4>) -> i1
-! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA]] : !fir.ref<!fir.array<10xf32>>)
+! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA]]{{.*}} : !fir.ref<!fir.array<10xf32>>{{.*}})
 
   !$acc host_data use_device(a) if(.true.)
   !$acc end host_data
 
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index)
 ! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
-! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref<!fir.array<10xf32>>)
+! CHECK: acc.host_data dataOperands(%[[DA]]{{.*}} : !fir.ref<!fir.array<10xf32>>{{.*}})
 
   !$acc host_data use_device(a) if(.false.)
     a = 1.0
diff --git a/flang/test/Lower/OpenACC/acc-host-data.f90 b/flang/test/Lower/OpenACC/acc-host-data.f90
index 871eabd256ca..4d09b25b983b 100644
--- a/flang/test/Lower/OpenACC/acc-host-data.f90
+++ b/flang/test/Lower/OpenACC/acc-host-data.f90
@@ -14,34 +14,37 @@ subroutine acc_host_data()
   !$acc host_data use_device(a)
   !$acc end host_data
 
-! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
-! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref<!fir.array<10xf32>>)
+! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
+! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
+! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
 
   !$acc host_data use_device(a) if_present
   !$acc end host_data
 
-! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
-! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref<!fir.array<10xf32>>) {
+! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
+! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
+! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
 ! CHECK: } attributes {ifPresent}
 
-  !$acc host_data use_device(a) if_present if_present
+  !$acc host_data use_device(a) if_present 
   !$acc end host_data
-! CHECK: acc.host_data dataOperands(%{{.*}} : !fir.ref<!fir.array<10xf32>>) {
+! CHECK: acc.host_data dataOperands(%{{.*}}{{.*}} : !fir.ref<!fir.array<10xf32>>{{.*}}) {
 ! CHECK: } attributes {ifPresent}
 
   !$acc host_data use_device(a) if(ifCondition)
   !$acc end host_data
 
-! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
+! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
+! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
 ! CHECK: %[[LOAD_IFCOND:.*]] = fir.load %[[DECLIFCOND]]#0 : !fir.ref<!fir.logical<4>>
 ! CHECK: %[[IFCOND_I1:.*]] = fir.convert %[[LOAD_IFCOND]] : (!fir.logical<4>) -> i1
-! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA]] : !fir.ref<!fir.array<10xf32>>)
+! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA0]]{{.*}} : !fir.ref<!fir.array<10xf32>>{{.*}})
 
   !$acc host_data use_device(a) if(.true.)
   !$acc end host_data
 
 ! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10xf32>>) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
-! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref<!fir.array<10xf32>>)
+! CHECK: acc.host_data dataOperands(%[[DA]]{{.*}} : !fir.ref<!fir.array<10xf32>>{{.*}})
 
   !$acc host_data use_device(a) if(.false.)
     a = 1.0
diff --git a/flang/test/Lower/OpenACC/acc-use-device.f90 b/flang/test/Lower/OpenACC/acc-use-device.f90
new file mode 100644
index 000000000000..081a6e317bfc
--- /dev/null
+++ b/flang/test/Lower/OpenACC/acc-use-device.f90
@@ -0,0 +1,61 @@
+! This test checks whether the OpenACC use_device clause is applied on both results of hlfir.declare.
+
+! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
+
+! Test for automatic variable appearing in use_device clause.
+subroutine test()
+  integer :: N = 100
+  real*8 :: b(-1:N)
+! CHECK: %[[A0:.*]] = fir.alloca !fir.array<?xf64>, %{{.*}} {bindc_name = "b", uniq_name = "_QFtestEb"}
+! CHECK: %[[A1:.*]] = fir.shape_shift {{.*}} : (index, index) -> !fir.shapeshift<1>
+! CHECK: %[[A:.*]]:2 = hlfir.declare %[[A0]](%[[A1]]) {uniq_name = "_QFtestEb"} : (!fir.ref<!fir.array<?xf64>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.array<?xf64>>)
+  
+  !$acc data copy(b)
+! CHECK: %[[B:.*]] = acc.copyin var(%[[A]]#0 : !fir.box<!fir.array<?xf64>>) -> !fir.box<!fir.array<?xf64>> {dataClause = #acc<data_clause acc_copy>, name = "b"}
+! CHECK: acc.data dataOperands(%[[B]] : !fir.box<!fir.array<?xf64>>) {
+
+  !$acc host_data use_device(b)
+  call vadd(b)
+  !$acc end host_data
+! CHECK: %[[C:.*]] = acc.use_device var(%[[A]]#0 : !fir.box<!fir.array<?xf64>>) -> !fir.box<!fir.array<?xf64>> {name = "b"}
+! CHECK: %[[D:.*]] = acc.use_device varPtr(%[[A]]#1 : !fir.ref<!fir.array<?xf64>>) -> !fir.ref<!fir.array<?xf64>> {name = "b"}
+! CHECK: acc.host_data dataOperands(%[[C]], %[[D]] : !fir.box<!fir.array<?xf64>>, !fir.ref<!fir.array<?xf64>>) {
+! CHECK: fir.call @_QPvadd(%[[A]]#1) fastmath<contract> : (!fir.ref<!fir.array<?xf64>>) -> ()
+  !$acc end data
+! CHECK: acc.copyout accVar(%[[B]] : !fir.box<!fir.array<?xf64>>) to var(%[[A]]#0 : !fir.box<!fir.array<?xf64>>) {dataClause = #acc<data_clause acc_copy>, name = "b"}
+end 
+
+! Test for allocatable, pointer and assumed-shape variables appearing in use_device clause.
+subroutine test2(a, b, c)
+  integer :: N = 100
+  real*8, allocatable :: a(:)
+  real*8, target, allocatable :: d(:)
+  real*8 :: b(:)
+  real*8, pointer :: c(:)
+  call allocate(a(N))
+  call allocate(d(N))
+  c => d
+! CHECK: %[[DS:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[E:.*]]:2 = hlfir.declare %arg0 dummy_scope %[[DS]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest2Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>)
+! CHECK: %[[F:.*]]:2 = hlfir.declare %arg1 dummy_scope %[[DS]] {uniq_name = "_QFtest2Eb"} : (!fir.box<!fir.array<?xf64>>, !fir.dscope) -> (!fir.box<!fir.array<?xf64>>, !fir.box<!fir.array<?xf64>>)
+! CHECK: %[[G:.*]]:2 = hlfir.declare %arg2 dummy_scope %[[DS]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest2Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>)
+
+  !$acc data copy(a,b,c,d)
+  !$acc host_data use_device(a,b,c)
+  call vadd2(a,b,c)
+  !$acc end host_data
+
+! CHECK: %[[H:.*]] = acc.use_device varPtr(%[[E]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>> {name = "a"}
+! CHECK: %[[I:.*]] = acc.use_device varPtr(%[[E]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>> {name = "a"}
+! CHECK: %[[J:.*]] = acc.use_device var(%[[F]]#0 : !fir.box<!fir.array<?xf64>>) -> !fir.box<!fir.array<?xf64>> {name = "b"}
+! CHECK: %[[K:.*]] = acc.use_device var(%[[F]]#1 : !fir.box<!fir.array<?xf64>>) -> !fir.box<!fir.array<?xf64>> {name = "b"}
+! CHECK: %[[L:.*]] = acc.use_device varPtr(%[[G]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>> {name = "c"}
+! CHECK: %[[M:.*]] = acc.use_device varPtr(%[[G]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>> {name = "c"}
+! CHECK: acc.host_data dataOperands(%[[H]], %[[I]], %[[J]], %[[K]], %[[L]], %[[M]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.box<!fir.array<?xf64>>, !fir.box<!fir.array<?xf64>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf64>>>>) {
+
+
+
+
+  !$acc end data
+
+end
-- 
cgit v1.2.3


From 7caf12da0bb09d6b6992bf42afd256d453753dcb Mon Sep 17 00:00:00 2001
From: Jeremy Kun <jkun@google.com>
Date: Thu, 17 Jul 2025 09:09:12 -0700
Subject: [mlir][core] Add an MLIR "pattern catalog" generator (#146228)

This PR adds a feature that attaches a listener to all RewritePatterns that
logs information about the modified operations.

When the MLIR test suite is run, these debug outputs can
be filtered and combined into an index linking operations to the
patterns that insert, modify, or replace them. This index is intended to
be used to create a website that allows one to look up patterns from an
operation name.

The debug logs emitted can be viewed with --debug-only=pattern-logging-listener,
and the lit config is modified to do this when the env var MLIR_GENERATE_PATTERN_CATALOG is set.

Example usage:

```
mkdir build && cd build
cmake -G Ninja ../llvm \
  -DLLVM_ENABLE_PROJECTS="mlir" \
  -DLLVM_TARGETS_TO_BUILD="host" \
  -DCMAKE_BUILD_TYPE=DEBUG
ninja -j 24 check-mlir
MLIR_GENERATE_PATTERN_CATALOG=1 bin/llvm-lit -j 24 -v -a tools/mlir/test | grep 'pattern-logging-listener' | sed 's/^# | [pattern-logging-listener] //g' | sort | uniq > pattern_catalog.txt
```

Sample pattern catalog output (that fits in a gist):
https://gist.github.com/j2kun/02d1ab8d31c10d71027724984c89905a

---------

Co-authored-by: Jeremy Kun <j2kun@users.noreply.github.com>
Co-authored-by: Mehdi Amini <joker.eph@gmail.com>
---
 mlir/include/mlir/IR/PatternMatch.h             | 19 ++++++++++
 mlir/lib/IR/CMakeLists.txt                      |  1 +
 mlir/lib/IR/PatternLoggingListener.cpp          | 50 +++++++++++++++++++++++++
 mlir/lib/Rewrite/PatternApplicator.cpp          | 16 +++++++-
 mlir/test/IR/test-pattern-logging-listener.mlir | 17 +++++++++
 mlir/test/lit.cfg.py                            | 11 ++++++
 6 files changed, 112 insertions(+), 2 deletions(-)
 create mode 100644 mlir/lib/IR/PatternLoggingListener.cpp
 create mode 100644 mlir/test/IR/test-pattern-logging-listener.mlir

diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h
index afeb784b85a1..3a2dbd136b43 100644
--- a/mlir/include/mlir/IR/PatternMatch.h
+++ b/mlir/include/mlir/IR/PatternMatch.h
@@ -475,6 +475,25 @@ public:
     RewriterBase::Listener *rewriteListener;
   };
 
+  /// A listener that logs notification events to llvm::dbgs() before
+  /// forwarding to the base listener.
+  struct PatternLoggingListener : public RewriterBase::ForwardingListener {
+    PatternLoggingListener(OpBuilder::Listener *listener, StringRef patternName)
+        : RewriterBase::ForwardingListener(listener), patternName(patternName) {
+    }
+
+    void notifyOperationInserted(Operation *op, InsertPoint previous) override;
+    void notifyOperationModified(Operation *op) override;
+    void notifyOperationReplaced(Operation *op, Operation *newOp) override;
+    void notifyOperationReplaced(Operation *op,
+                                 ValueRange replacement) override;
+    void notifyOperationErased(Operation *op) override;
+    void notifyPatternBegin(const Pattern &pattern, Operation *op) override;
+
+  private:
+    StringRef patternName;
+  };
+
   /// Move the blocks that belong to "region" before the given position in
   /// another region "parent". The two regions must be different. The caller
   /// is responsible for creating or updating the operation transferring flow
diff --git a/mlir/lib/IR/CMakeLists.txt b/mlir/lib/IR/CMakeLists.txt
index 4cabac185171..3ef69cea18f0 100644
--- a/mlir/lib/IR/CMakeLists.txt
+++ b/mlir/lib/IR/CMakeLists.txt
@@ -29,6 +29,7 @@ add_mlir_library(MLIRIR
   ODSSupport.cpp
   Operation.cpp
   OperationSupport.cpp
+  PatternLoggingListener.cpp
   PatternMatch.cpp
   Region.cpp
   RegionKindInterface.cpp
diff --git a/mlir/lib/IR/PatternLoggingListener.cpp b/mlir/lib/IR/PatternLoggingListener.cpp
new file mode 100644
index 000000000000..ce2123ae1a19
--- /dev/null
+++ b/mlir/lib/IR/PatternLoggingListener.cpp
@@ -0,0 +1,50 @@
+#include "mlir/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "pattern-logging-listener"
+#define DBGS() (llvm::dbgs() << "[" << DEBUG_TYPE << "] ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+
+using namespace mlir;
+
+void RewriterBase::PatternLoggingListener::notifyOperationInserted(
+    Operation *op, InsertPoint previous) {
+  LDBG(patternName << " | notifyOperationInserted"
+                   << " | " << op->getName());
+  ForwardingListener::notifyOperationInserted(op, previous);
+}
+
+void RewriterBase::PatternLoggingListener::notifyOperationModified(
+    Operation *op) {
+  LDBG(patternName << " | notifyOperationModified"
+                   << " | " << op->getName());
+  ForwardingListener::notifyOperationModified(op);
+}
+
+void RewriterBase::PatternLoggingListener::notifyOperationReplaced(
+    Operation *op, Operation *newOp) {
+  LDBG(patternName << " | notifyOperationReplaced (with op)"
+                   << " | " << op->getName() << " | " << newOp->getName());
+  ForwardingListener::notifyOperationReplaced(op, newOp);
+}
+
+void RewriterBase::PatternLoggingListener::notifyOperationReplaced(
+    Operation *op, ValueRange replacement) {
+  LDBG(patternName << " | notifyOperationReplaced (with values)"
+                   << " | " << op->getName());
+  ForwardingListener::notifyOperationReplaced(op, replacement);
+}
+
+void RewriterBase::PatternLoggingListener::notifyOperationErased(
+    Operation *op) {
+  LDBG(patternName << " | notifyOperationErased"
+                   << " | " << op->getName());
+  ForwardingListener::notifyOperationErased(op);
+}
+
+void RewriterBase::PatternLoggingListener::notifyPatternBegin(
+    const Pattern &pattern, Operation *op) {
+  LDBG(patternName << " | notifyPatternBegin"
+                   << " | " << op->getName());
+  ForwardingListener::notifyPatternBegin(pattern, op);
+}
diff --git a/mlir/lib/Rewrite/PatternApplicator.cpp b/mlir/lib/Rewrite/PatternApplicator.cpp
index 4a12183492fd..b2b372b7b124 100644
--- a/mlir/lib/Rewrite/PatternApplicator.cpp
+++ b/mlir/lib/Rewrite/PatternApplicator.cpp
@@ -15,6 +15,10 @@
 #include "ByteCode.h"
 #include "llvm/Support/Debug.h"
 
+#ifndef NDEBUG
+#include "llvm/ADT/ScopeExit.h"
+#endif
+
 #define DEBUG_TYPE "pattern-application"
 
 using namespace mlir;
@@ -206,11 +210,19 @@ LogicalResult PatternApplicator::matchAndRewrite(
           } else {
             LLVM_DEBUG(llvm::dbgs() << "Trying to match \""
                                     << bestPattern->getDebugName() << "\"\n");
-
             const auto *pattern =
                 static_cast<const RewritePattern *>(bestPattern);
-            result = pattern->matchAndRewrite(op, rewriter);
 
+#ifndef NDEBUG
+            OpBuilder::Listener *oldListener = rewriter.getListener();
+            auto loggingListener =
+                std::make_unique<RewriterBase::PatternLoggingListener>(
+                    oldListener, pattern->getDebugName());
+            rewriter.setListener(loggingListener.get());
+            auto resetListenerCallback = llvm::make_scope_exit(
+                [&] { rewriter.setListener(oldListener); });
+#endif
+            result = pattern->matchAndRewrite(op, rewriter);
             LLVM_DEBUG(llvm::dbgs()
                        << "\"" << bestPattern->getDebugName() << "\" result "
                        << succeeded(result) << "\n");
diff --git a/mlir/test/IR/test-pattern-logging-listener.mlir b/mlir/test/IR/test-pattern-logging-listener.mlir
new file mode 100644
index 000000000000..a1d27741a072
--- /dev/null
+++ b/mlir/test/IR/test-pattern-logging-listener.mlir
@@ -0,0 +1,17 @@
+// RUN: mlir-opt %s --test-walk-pattern-rewrite-driver \
+// RUN:   --allow-unregistered-dialect --debug-only=pattern-logging-listener 2>&1 | FileCheck %s
+
+// Check that when replacing an op with a new op, we get appropriate
+// pattern-logging lines. The regex is because the anonymous namespace is
+// printed differently on different platforms.
+
+// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationInserted | test.new_op
+// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationReplaced (with values) | test.replace_with_new_op
+// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationModified | arith.addi
+// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationModified | arith.addi
+// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationErased | test.replace_with_new_op
+func.func @replace_with_new_op() -> i32 {
+  %a = "test.replace_with_new_op"() : () -> (i32)
+  %res = arith.addi %a, %a : i32
+  return %res : i32
+}
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index 9b5cadd62bef..233fef8ec429 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -301,6 +301,17 @@ if "MLIR_OPT_CHECK_IR_ROUNDTRIP" in os.environ:
             ToolSubst("mlir-opt", "mlir-opt --verify-roundtrip", unresolved="fatal"),
         ]
     )
+elif "MLIR_GENERATE_PATTERN_CATALOG" in os.environ:
+    tools.extend(
+        [
+            ToolSubst(
+                "mlir-opt",
+                "mlir-opt --debug-only=pattern-logging-listener --mlir-disable-threading",
+                unresolved="fatal",
+            ),
+            ToolSubst("FileCheck", "FileCheck --dump-input=always", unresolved="fatal"),
+        ]
+    )
 else:
     tools.extend(["mlir-opt"])
 
-- 
cgit v1.2.3


From d97c224e8cbba9158ebda6f12f9a06b09534ae29 Mon Sep 17 00:00:00 2001
From: Connector Switch <c8ef@outlook.com>
Date: Fri, 18 Jul 2025 00:10:07 +0800
Subject: [libc][NFC]: Correct some comments about SDCOMP-26094. (#149317)

---
 libc/test/src/math/cospif_test.cpp  | 2 +-
 libc/test/src/math/sincosf_test.cpp | 2 +-
 libc/test/src/math/sinpif_test.cpp  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libc/test/src/math/cospif_test.cpp b/libc/test/src/math/cospif_test.cpp
index cb88bfcade0d..5c30fb7c8718 100644
--- a/libc/test/src/math/cospif_test.cpp
+++ b/libc/test/src/math/cospif_test.cpp
@@ -100,7 +100,7 @@ TEST_F(LlvmLibcCospifTest, SmallValues) {
                                  LIBC_NAMESPACE::cospif(x), 0.5);
 }
 
-// SDCOMP-26094: check sinfpi in the cases for which the range reducer
+// SDCOMP-26094: check cospif in the cases for which the range reducer
 // returns values furthest beyond its nominal upper bound of pi/4.
 TEST_F(LlvmLibcCospifTest, SDCOMP_26094) {
   for (uint32_t v : SDCOMP26094_VALUES) {
diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp
index ad2155f329cd..4aac1fabfbd6 100644
--- a/libc/test/src/math/sincosf_test.cpp
+++ b/libc/test/src/math/sincosf_test.cpp
@@ -164,7 +164,7 @@ TEST_F(LlvmLibcSinCosfTest, SpecialValues) {
   }
 }
 
-// SDCOMP-26094: check sinf in the cases for which the range reducer
+// SDCOMP-26094: check sincosf in the cases for which the range reducer
 // returns values furthest beyond its nominal upper bound of pi/4.
 TEST_F(LlvmLibcSinCosfTest, SDCOMP_26094) {
   for (uint32_t v : SDCOMP26094_VALUES) {
diff --git a/libc/test/src/math/sinpif_test.cpp b/libc/test/src/math/sinpif_test.cpp
index 986c676761f0..94e3dbc4f07d 100644
--- a/libc/test/src/math/sinpif_test.cpp
+++ b/libc/test/src/math/sinpif_test.cpp
@@ -100,7 +100,7 @@ TEST_F(LlvmLibcSinpifTest, SmallValues) {
                                  LIBC_NAMESPACE::sinpif(x), 0.5);
 }
 
-// SDCOMP-26094: check sinfpi in the cases for which the range reducer
+// SDCOMP-26094: check sinpif in the cases for which the range reducer
 // returns values furthest beyond its nominal upper bound of pi/4.
 TEST_F(LlvmLibcSinpifTest, SDCOMP_26094) {
   for (uint32_t v : SDCOMP26094_VALUES) {
-- 
cgit v1.2.3


From 011d38bdac95647a872a5faa339465e26535df35 Mon Sep 17 00:00:00 2001
From: erichkeane <ekeane@nvidia.com>
Date: Thu, 17 Jul 2025 07:43:14 -0700
Subject: [OpenACC] Update OpenACC macro, remove override macro

As we are now Sema-complete for OpenACC 3.4 (and thus have a conforming
implementation, in all modes), we can now set the _OPENACC macro
correctly.

Additionally, we remove the temporary 'override' functionality, which
was intended to allow people to experiment with this. We aren't having a
deprecation period as OpenACC support is still considered experimental.
---
 clang/include/clang/Basic/LangOptions.h   |  5 -----
 clang/include/clang/Driver/Options.td     | 13 -------------
 clang/lib/Driver/ToolChains/Clang.cpp     |  9 ---------
 clang/lib/Frontend/CompilerInvocation.cpp | 12 ++----------
 clang/lib/Frontend/InitPreprocessor.cpp   | 12 ++----------
 clang/test/Driver/openacc.c               | 12 ------------
 clang/test/Preprocessor/openacc.c         |  8 ++------
 7 files changed, 6 insertions(+), 65 deletions(-)

diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index 937cbff4e3ea..0407897359b5 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -633,11 +633,6 @@ public:
   // received as a result of a standard operator new (-fcheck-new)
   bool CheckNew = false;
 
-  // In OpenACC mode, contains a user provided override for the _OPENACC macro.
-  // This exists so that we can override the macro value and test our incomplete
-  // implementation on real-world examples.
-  std::string OpenACCMacroOverride;
-
   /// The HLSL root signature version for dxil.
   llvm::dxbc::RootSignatureVersion HLSLRootSigVer =
       llvm::dxbc::RootSignatureVersion::V1_1;
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index a8c1b5dd8ab3..6c22f06b269f 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1422,19 +1422,6 @@ def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">,
   HelpText<"Do not override toolchain to compile HIP source to relocatable">;
 }
 
-// Clang specific/exclusive options for OpenACC.
-def openacc_macro_override
-    : Separate<["-"], "fexperimental-openacc-macro-override">,
-      Visibility<[ClangOption, CC1Option]>,
-      Group<f_Group>,
-      HelpText<"Overrides the _OPENACC macro value for experimental testing "
-               "during OpenACC support development">;
-def openacc_macro_override_EQ
-    : Joined<["-"], "fexperimental-openacc-macro-override=">,
-      Alias<openacc_macro_override>;
-
-// End Clang specific/exclusive options for OpenACC.
-
 def libomptarget_amdgpu_bc_path_EQ : Joined<["--"], "libomptarget-amdgpu-bc-path=">, Group<i_Group>,
   HelpText<"Path to libomptarget-amdgcn bitcode library">;
 def libomptarget_amdgcn_bc_path_EQ : Joined<["--"], "libomptarget-amdgcn-bc-path=">, Group<i_Group>,
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 456bfe885f35..8880c9375143 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -3846,15 +3846,6 @@ static void RenderOpenACCOptions(const Driver &D, const ArgList &Args,
     return;
 
   CmdArgs.push_back("-fopenacc");
-
-  if (Arg *A = Args.getLastArg(options::OPT_openacc_macro_override)) {
-    StringRef Value = A->getValue();
-    int Version;
-    if (!Value.getAsInteger(10, Version))
-      A->renderAsInput(Args, CmdArgs);
-    else
-      D.Diag(diag::err_drv_clang_unsupported) << Value;
-  }
 }
 
 static void RenderBuiltinOptions(const ToolChain &TC, const llvm::Triple &T,
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 6ab36d867596..3a36250da57a 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -3913,12 +3913,8 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts,
   if (Opts.OpenMPCUDAMode)
     GenerateArg(Consumer, OPT_fopenmp_cuda_mode);
 
-  if (Opts.OpenACC) {
+  if (Opts.OpenACC)
     GenerateArg(Consumer, OPT_fopenacc);
-    if (!Opts.OpenACCMacroOverride.empty())
-      GenerateArg(Consumer, OPT_openacc_macro_override,
-                  Opts.OpenACCMacroOverride);
-  }
 
   // The arguments used to set Optimize, OptimizeSize and NoInlineDefine are
   // generated from CodeGenOptions.
@@ -4424,13 +4420,9 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
                         Args.hasArg(options::OPT_fopenmp_cuda_mode);
 
   // OpenACC Configuration.
-  if (Args.hasArg(options::OPT_fopenacc)) {
+  if (Args.hasArg(options::OPT_fopenacc))
     Opts.OpenACC = true;
 
-    if (Arg *A = Args.getLastArg(options::OPT_openacc_macro_override))
-      Opts.OpenACCMacroOverride = A->getValue();
-  }
-
   if (Arg *A = Args.getLastArg(OPT_ffp_contract)) {
     StringRef Val = A->getValue();
     if (Val == "fast")
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 38b2e0cf1ca5..382ccd610946 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -639,16 +639,8 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
     }
   }
 
-  if (LangOpts.OpenACC) {
-    // FIXME: When we have full support for OpenACC, we should set this to the
-    // version we support. Until then, set as '1' by default, but provide a
-    // temporary mechanism for users to override this so real-world examples can
-    // be tested against.
-    if (!LangOpts.OpenACCMacroOverride.empty())
-      Builder.defineMacro("_OPENACC", LangOpts.OpenACCMacroOverride);
-    else
-      Builder.defineMacro("_OPENACC", "1");
-  }
+  if (LangOpts.OpenACC)
+    Builder.defineMacro("_OPENACC", "202506");
 }
 
 /// Initialize the predefined C++ language feature test macros defined in
diff --git a/clang/test/Driver/openacc.c b/clang/test/Driver/openacc.c
index c7f1d2545bd0..f46e2a32bcab 100644
--- a/clang/test/Driver/openacc.c
+++ b/clang/test/Driver/openacc.c
@@ -1,14 +1,2 @@
 // RUN: %clang -S -### -fopenacc %s 2>&1 | FileCheck %s --check-prefix=CHECK-DRIVER
 // CHECK-DRIVER: "-cc1" {{.*}} "-fopenacc"
-
-// RUN: %clang -S -### -fopenacc -fexperimental-openacc-macro-override=202211 %s 2>&1 | FileCheck %s --check-prefix=CHECK-MACRO-OVERRIDE
-// RUN: %clang -S -### -fopenacc -fexperimental-openacc-macro-override 202211 %s 2>&1 | FileCheck %s --check-prefix=CHECK-MACRO-OVERRIDE
-// CHECK-MACRO-OVERRIDE: "-cc1"{{.*}} "-fexperimental-openacc-macro-override" "202211"
-
-// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override=202211L %s 2>&1 | FileCheck %s --check-prefix=INVALID
-// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override 202211L %s 2>&1 | FileCheck %s --check-prefix=INVALID
-// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override=L202211 %s 2>&1 | FileCheck %s --check-prefix=INVALID
-// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override L202211 %s 2>&1 | FileCheck %s --check-prefix=INVALID
-// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override=2022L11 %s 2>&1 | FileCheck %s --check-prefix=INVALID
-// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override 2022L11 %s 2>&1 | FileCheck %s --check-prefix=INVALID
-// INVALID: error: the clang compiler does not support
diff --git a/clang/test/Preprocessor/openacc.c b/clang/test/Preprocessor/openacc.c
index be7052f00e0c..283baa6c2fe4 100644
--- a/clang/test/Preprocessor/openacc.c
+++ b/clang/test/Preprocessor/openacc.c
@@ -1,13 +1,9 @@
 // RUN: %clang_cc1 -E -fopenacc %s | FileCheck %s --check-prefix=DEFAULT
-// RUN: %clang_cc1 -E -fopenacc -fexperimental-openacc-macro-override 202211 %s | FileCheck %s --check-prefix=OVERRIDE
 
-// DEFAULT: OpenACC:1:
-// OVERRIDE: OpenACC:202211:
+// DEFAULT: OpenACC:202506:
 OpenACC:_OPENACC:
 
 // RUN: %clang_cc1 -E -dM -fopenacc %s | FileCheck %s --check-prefix=MACRO_PRINT_DEF
-// RUN: %clang_cc1 -E -dM -fopenacc -fexperimental-openacc-macro-override 202211 %s | FileCheck %s --check-prefix=MACRO_PRINT_OVR
-// MACRO_PRINT_DEF: #define _OPENACC 1
-// MACRO_PRINT_OVR: #define _OPENACC 202211
+// MACRO_PRINT_DEF: #define _OPENACC 202506
 
 
-- 
cgit v1.2.3


From a6fb3b3c18fd48a2eaaa8c969edbc013b9276a09 Mon Sep 17 00:00:00 2001
From: Jacob Lalonde <jalalonde@fb.com>
Date: Thu, 17 Jul 2025 09:18:23 -0700
Subject: [LLDB] Process minidump better error messages (#149206)

Prior, Process Minidump would return

```
Status::FromErrorString("could not parse memory info");
```

For any unsuccessful memory read, with no differentiation between an
error in LLDB and the data simply not being present. This led to a lot
of user confusion and an overall pretty terrible user experience. To fix
this I've refactored the APIs so we can pass an error back in an
llvm::Expected.

There were also no shell tests for memory read and process Minidump so I
added one.
---
 .../Plugins/Process/minidump/MinidumpParser.cpp    | 34 ++++++++++++++----
 .../Plugins/Process/minidump/MinidumpParser.h      |  3 +-
 .../Plugins/Process/minidump/ProcessMinidump.cpp   |  9 +++--
 .../test/Shell/Minidump/missing-memory-region.yaml | 42 ++++++++++++++++++++++
 .../Process/minidump/MinidumpParserTest.cpp        | 23 ++++++------
 5 files changed, 91 insertions(+), 20 deletions(-)
 create mode 100644 lldb/test/Shell/Minidump/missing-memory-region.yaml

diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp
index ef691b77193c..58ebb7be1199 100644
--- a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp
+++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp
@@ -108,13 +108,21 @@ MinidumpParser::GetThreadContext(const minidump::Thread &td) {
 
 llvm::ArrayRef<uint8_t>
 MinidumpParser::GetThreadContextWow64(const minidump::Thread &td) {
+  Log *log = GetLog(LLDBLog::Process);
   // On Windows, a 32-bit process can run on a 64-bit machine under WOW64. If
   // the minidump was captured with a 64-bit debugger, then the CONTEXT we just
   // grabbed from the mini_dump_thread is the one for the 64-bit "native"
   // process rather than the 32-bit "guest" process we care about.  In this
   // case, we can get the 32-bit CONTEXT from the TEB (Thread Environment
   // Block) of the 64-bit process.
-  auto teb_mem = GetMemory(td.EnvironmentBlock, sizeof(TEB64));
+  auto teb_mem_maybe = GetMemory(td.EnvironmentBlock, sizeof(TEB64));
+  if (!teb_mem_maybe) {
+    LLDB_LOG_ERROR(log, teb_mem_maybe.takeError(),
+                   "Failed to read Thread Environment Block: {0}");
+    return {};
+  }
+
+  auto teb_mem = *teb_mem_maybe;
   if (teb_mem.empty())
     return {};
 
@@ -126,8 +134,16 @@ MinidumpParser::GetThreadContextWow64(const minidump::Thread &td) {
   // Slot 1 of the thread-local storage in the 64-bit TEB points to a structure
   // that includes the 32-bit CONTEXT (after a ULONG). See:
   // https://msdn.microsoft.com/en-us/library/ms681670.aspx
-  auto context =
+  auto context_maybe =
       GetMemory(wow64teb->tls_slots[1] + 4, sizeof(MinidumpContext_x86_32));
+  if (!context_maybe) {
+    LLDB_LOG_ERROR(log, context_maybe.takeError(),
+                   "Failed to read WOW Thread Context: {0}");
+    return {};
+  }
+
+  auto context = *context_maybe;
+
   if (context.size() < sizeof(MinidumpContext_x86_32))
     return {};
 
@@ -478,11 +494,13 @@ void MinidumpParser::PopulateMemoryRanges() {
   m_memory_ranges.Sort();
 }
 
-llvm::ArrayRef<uint8_t> MinidumpParser::GetMemory(lldb::addr_t addr,
-                                                  size_t size) {
+llvm::Expected<llvm::ArrayRef<uint8_t>>
+MinidumpParser::GetMemory(lldb::addr_t addr, size_t size) {
   std::optional<minidump::Range> range = FindMemoryRange(addr);
   if (!range)
-    return {};
+    return llvm::createStringError(
+        llvm::inconvertibleErrorCode(),
+        "No memory range found for address (0x%" PRIx64 ")", addr);
 
   // There's at least some overlap between the beginning of the desired range
   // (addr) and the current range.  Figure out where the overlap begins and
@@ -491,7 +509,11 @@ llvm::ArrayRef<uint8_t> MinidumpParser::GetMemory(lldb::addr_t addr,
   const size_t offset = addr - range->start;
 
   if (addr < range->start || offset >= range->range_ref.size())
-    return {};
+    return llvm::createStringError(
+        llvm::inconvertibleErrorCode(),
+        "Address (0x%" PRIx64 ") is not in range [0x%" PRIx64 " - 0x%" PRIx64
+        ")",
+        addr, range->start, range->start + range->range_ref.size());
 
   const size_t overlap = std::min(size, range->range_ref.size() - offset);
   return range->range_ref.slice(offset, overlap);
diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.h b/lldb/source/Plugins/Process/minidump/MinidumpParser.h
index 14599f8d572a..3b7d33daca71 100644
--- a/lldb/source/Plugins/Process/minidump/MinidumpParser.h
+++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.h
@@ -104,7 +104,8 @@ public:
 
   std::optional<Range> FindMemoryRange(lldb::addr_t addr);
 
-  llvm::ArrayRef<uint8_t> GetMemory(lldb::addr_t addr, size_t size);
+  llvm::Expected<llvm::ArrayRef<uint8_t>> GetMemory(lldb::addr_t addr,
+                                                    size_t size);
 
   /// Returns a list of memory regions and a flag indicating whether the list is
   /// complete (includes all regions mapped into the process memory).
diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp
index ef3c00e2857d..17a421a72274 100644
--- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp
+++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp
@@ -322,12 +322,15 @@ size_t ProcessMinidump::ReadMemory(lldb::addr_t addr, void *buf, size_t size,
 size_t ProcessMinidump::DoReadMemory(lldb::addr_t addr, void *buf, size_t size,
                                      Status &error) {
 
-  llvm::ArrayRef<uint8_t> mem = m_minidump_parser->GetMemory(addr, size);
-  if (mem.empty()) {
-    error = Status::FromErrorString("could not parse memory info");
+  llvm::Expected<llvm::ArrayRef<uint8_t>> mem_maybe =
+      m_minidump_parser->GetMemory(addr, size);
+  if (!mem_maybe) {
+    error = Status::FromError(mem_maybe.takeError());
     return 0;
   }
 
+  llvm::ArrayRef<uint8_t> mem = *mem_maybe;
+
   std::memcpy(buf, mem.data(), mem.size());
   return mem.size();
 }
diff --git a/lldb/test/Shell/Minidump/missing-memory-region.yaml b/lldb/test/Shell/Minidump/missing-memory-region.yaml
new file mode 100644
index 000000000000..1784cacfaf1b
--- /dev/null
+++ b/lldb/test/Shell/Minidump/missing-memory-region.yaml
@@ -0,0 +1,42 @@
+# Check that looking up a memory region not present in the Minidump fails
+# even if it's in the /proc/<pid>/maps file.
+
+# RUN: yaml2obj %s -o %t
+# RUN: %lldb -c %t -o "memory read 0x5000" 2>&1 | FileCheck %s
+
+# CHECK-LABEL: (lldb) memory read 0x5000
+# CHECK-NEXT: error: No memory range found for address (0x5000)
+
+--- !minidump
+Streams:
+  - Type:            SystemInfo
+    Processor Arch:  AMD64
+    Processor Level: 6
+    Processor Revision: 15876
+    Number of Processors: 40
+    Platform ID:     Linux
+    CSD Version:     'Linux 3.13.0-91-generic #138-Ubuntu SMP Fri Jun 24 17:00:34 UTC 2016 x86_64'
+    CPU:
+      Vendor ID:       GenuineIntel
+      Version Info:    0x00000000
+      Feature Info:    0x00000000
+  - Type:            LinuxProcStatus
+    Text:             |
+      Name:	test-yaml
+      Umask:	0002
+      State:	t (tracing stop)
+      Pid:	8567
+  - Type:            LinuxMaps
+    Text:             |
+      0x1000-0x1100     r-xp 00000000 00:00 0
+      0x2000-0x2200     rw-p 00000000 00:00 0
+      0x4000-0x6000     rw-- 00000000 00:00 0
+  - Type:            Memory64List
+    Memory Ranges:
+      - Start of Memory Range: 0x1000
+        Data Size:       0x100
+        Content :        ''
+      - Start of Memory Range: 0x2000
+        Data Size:       0x200
+        Content :        ''
+...
diff --git a/lldb/unittests/Process/minidump/MinidumpParserTest.cpp b/lldb/unittests/Process/minidump/MinidumpParserTest.cpp
index ee31c8e63644..44f653c6fa13 100644
--- a/lldb/unittests/Process/minidump/MinidumpParserTest.cpp
+++ b/lldb/unittests/Process/minidump/MinidumpParserTest.cpp
@@ -308,16 +308,19 @@ Streams:
 )"),
                     llvm::Succeeded());
 
-  EXPECT_EQ((llvm::ArrayRef<uint8_t>{0x54}), parser->GetMemory(0x401d46, 1));
-  EXPECT_EQ((llvm::ArrayRef<uint8_t>{0x54, 0x21}),
-            parser->GetMemory(0x401d46, 4));
-
-  EXPECT_EQ((llvm::ArrayRef<uint8_t>{0xc8, 0x4d, 0x04, 0xbc, 0xe9}),
-            parser->GetMemory(0x7ffceb34a000, 5));
-  EXPECT_EQ((llvm::ArrayRef<uint8_t>{0xc8, 0x4d, 0x04}),
-            parser->GetMemory(0x7ffceb34a000, 3));
-
-  EXPECT_EQ(llvm::ArrayRef<uint8_t>(), parser->GetMemory(0x500000, 512));
+  EXPECT_THAT_EXPECTED(parser->GetMemory(0x401d46, 1),
+                       llvm::HasValue(llvm::ArrayRef<uint8_t>{0x54}));
+  EXPECT_THAT_EXPECTED(parser->GetMemory(0x401d46, 4),
+                       llvm::HasValue(llvm::ArrayRef<uint8_t>{0x54, 0x21}));
+  EXPECT_THAT_EXPECTED(
+      parser->GetMemory(0x7ffceb34a000, 5),
+      llvm::HasValue(llvm::ArrayRef<uint8_t>{0xc8, 0x4d, 0x04, 0xbc, 0xe9}));
+  EXPECT_THAT_EXPECTED(
+      parser->GetMemory(0x7ffceb34a000, 3),
+      llvm::HasValue(llvm::ArrayRef<uint8_t>{0xc8, 0x4d, 0x04}));
+  EXPECT_THAT_EXPECTED(
+      parser->GetMemory(0x500000, 512),
+      llvm::FailedWithMessage("No memory range found for address (0x500000)"));
 }
 
 TEST_F(MinidumpParserTest, FindMemoryRangeWithFullMemoryMinidump) {
-- 
cgit v1.2.3


From e8a891b0f9d2a742ac3904116aaec2c7c9231b24 Mon Sep 17 00:00:00 2001
From: Jonathan Cohen <joncoh@apple.com>
Date: Thu, 17 Jul 2025 19:21:57 +0300
Subject: [AArch64][Machine-Combiner] Split gather patterns into neon regs to
 multiple vectors (#142941)

This change optimizes gather-like sequences, where we load values
separately into lanes of a NEON vector. Since each load has a serial
dependency on the previous one, when performing, for example, multiple i32
loads into a 128-bit vector, it is more profitable to load into separate
vector registers and zip them.

rdar://151851094
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp       | 265 +++++++++++++++
 llvm/lib/Target/AArch64/AArch64InstrInfo.h         |   4 +
 .../AArch64/aarch64-combine-gather-lanes.mir       | 364 +++++++++++++++++++++
 .../complex-deinterleaving-uniform-cases.ll        | 134 ++++----
 llvm/test/CodeGen/AArch64/concat-vector.ll         |   5 +-
 .../CodeGen/AArch64/fp-maximumnum-minimumnum.ll    |  50 +--
 llvm/test/CodeGen/AArch64/fsh.ll                   | 113 +++----
 llvm/test/CodeGen/AArch64/llvm.frexp.ll            |  14 +-
 llvm/test/CodeGen/AArch64/neon-dotreduce.ll        | 345 +++++++++----------
 llvm/test/CodeGen/AArch64/nontemporal.ll           |  48 +--
 10 files changed, 996 insertions(+), 346 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 996b0edd2420..bc57537ad5df 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -20,6 +20,7 @@
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/CFIInstBuilder.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
@@ -35,6 +36,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -7351,6 +7353,9 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
   case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
+  case AArch64MachineCombinerPattern::GATHER_LANE_i32:
+  case AArch64MachineCombinerPattern::GATHER_LANE_i16:
+  case AArch64MachineCombinerPattern::GATHER_LANE_i8:
     return true;
   } // end switch (Pattern)
   return false;
@@ -7391,11 +7396,252 @@ static bool getMiscPatterns(MachineInstr &Root,
   return false;
 }
 
+static bool getGatherPattern(MachineInstr &Root,
+                             SmallVectorImpl<unsigned> &Patterns,
+                             unsigned LoadLaneOpCode, unsigned NumLanes) {
+  const MachineFunction *MF = Root.getMF();
+
+  // Early exit if optimizing for size.
+  if (MF->getFunction().hasMinSize())
+    return false;
+
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+
+  // The root of the pattern must load into the last lane of the vector.
+  if (Root.getOperand(2).getImm() != NumLanes - 1)
+    return false;
+
+  // Check that we have a load into every lane except lane 0.
+  // For each load we also want to check that:
+  // 1. It has a single non-debug use (since we will be replacing the virtual
+  // register).
+  // 2. Its addressing mode only uses a single register (4 operands in total).
+  auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+  auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
+  SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end());
+  while (!RemainingLanes.empty() && CurrInstr &&
+         CurrInstr->getOpcode() == LoadLaneOpCode &&
+         MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
+         CurrInstr->getNumOperands() == 4) {
+    RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
+    CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+  }
+
+  if (!RemainingLanes.empty())
+    return false;
+
+  // Match the SUBREG_TO_REG sequence.
+  if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
+    return false;
+
+  // Verify that the subreg to reg loads an integer into the first lane.
+  auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
+  unsigned SingleLaneSizeInBits = 128 / NumLanes;
+  if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
+    return false;
+
+  // Verify that it also has a single non debug use.
+  if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
+    return false;
+
+  switch (NumLanes) {
+  case 4:
+    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
+    break;
+  case 8:
+    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16);
+    break;
+  case 16:
+    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8);
+    break;
+  default:
+    llvm_unreachable("Got bad number of lanes for gather pattern.");
+  }
+
+  return true;
+}
+
+/// Search for patterns where we use LD1 instructions to load into
+/// separate lanes of a 128-bit Neon register. We can increase Memory Level
+/// Parallelism by loading into 2 Neon registers instead.
+static bool getLoadPatterns(MachineInstr &Root,
+                            SmallVectorImpl<unsigned> &Patterns) {
+
+  // The pattern searches for loads into single lanes.
+  switch (Root.getOpcode()) {
+  case AArch64::LD1i32:
+    return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
+  case AArch64::LD1i16:
+    return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
+  case AArch64::LD1i8:
+    return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
+  default:
+    return false;
+  }
+}
+
+static void
+generateGatherPattern(MachineInstr &Root,
+                      SmallVectorImpl<MachineInstr *> &InsInstrs,
+                      SmallVectorImpl<MachineInstr *> &DelInstrs,
+                      DenseMap<Register, unsigned> &InstrIdxForVirtReg,
+                      unsigned Pattern, unsigned NumLanes) {
+
+  MachineFunction &MF = *Root.getParent()->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+  // Gather the initial load instructions to build the pattern
+  SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
+  MachineInstr *CurrInstr = &Root;
+  for (unsigned i = 0; i < NumLanes - 1; ++i) {
+    LoadToLaneInstrs.push_back(CurrInstr);
+    CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+  }
+
+  // Sort the load instructions according to the lane.
+  llvm::sort(LoadToLaneInstrs,
+             [](const MachineInstr *A, const MachineInstr *B) {
+               return A->getOperand(2).getImm() > B->getOperand(2).getImm();
+             });
+
+  MachineInstr *SubregToReg = CurrInstr;
+  LoadToLaneInstrs.push_back(
+      MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg()));
+  auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
+
+  const TargetRegisterClass *FPR128RegClass =
+      MRI.getRegClass(Root.getOperand(0).getReg());
+
+  auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
+                                Register SrcRegister, unsigned Lane,
+                                Register OffsetRegister) {
+    auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
+    MachineInstrBuilder LoadIndexIntoRegister =
+        BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
+                NewRegister)
+            .addReg(SrcRegister)
+            .addImm(Lane)
+            .addReg(OffsetRegister, getKillRegState(true));
+    InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
+    InsInstrs.push_back(LoadIndexIntoRegister);
+    return NewRegister;
+  };
+
+  // Helper to create the scalar load instruction selected by the lane count.
+  auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg,
+                                   Register OffsetReg) -> MachineInstrBuilder {
+    unsigned Opcode;
+    switch (NumLanes) {
+    case 4:
+      Opcode = AArch64::LDRSui;
+      break;
+    case 8:
+      Opcode = AArch64::LDRHui;
+      break;
+    case 16:
+      Opcode = AArch64::LDRBui;
+      break;
+    default:
+      llvm_unreachable(
+          "Got unsupported number of lanes in machine-combiner gather pattern");
+    }
+    // Immediate offset load
+    return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
+        .addReg(OffsetReg)
+        .addImm(0); // immediate offset
+  };
+
+  // Load the remaining lanes into register 0.
+  auto LanesToLoadToReg0 =
+      llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
+                       LoadToLaneInstrsAscending.begin() + NumLanes / 2);
+  auto PrevReg = SubregToReg->getOperand(0).getReg();
+  for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
+    PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
+                                 LoadInstr->getOperand(3).getReg());
+    DelInstrs.push_back(LoadInstr);
+  }
+  auto LastLoadReg0 = PrevReg;
+
+  // First load into register 1. Perform a scalar load (LDR[SHB]ui) to zero out
+  // the upper lanes in a single instruction.
+  auto Lane0Load = *LoadToLaneInstrsAscending.begin();
+  auto OriginalSplitLoad =
+      *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
+  auto DestRegForMiddleIndex = MRI.createVirtualRegister(
+      MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
+
+  MachineInstrBuilder MiddleIndexLoadInstr =
+      CreateLoadInstruction(NumLanes, DestRegForMiddleIndex,
+                            OriginalSplitLoad->getOperand(3).getReg());
+
+  InstrIdxForVirtReg.insert(
+      std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
+  InsInstrs.push_back(MiddleIndexLoadInstr);
+  DelInstrs.push_back(OriginalSplitLoad);
+
+  // Subreg To Reg instruction for register 1.
+  auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
+  unsigned SubregType;
+  switch (NumLanes) {
+  case 4:
+    SubregType = AArch64::ssub;
+    break;
+  case 8:
+    SubregType = AArch64::hsub;
+    break;
+  case 16:
+    SubregType = AArch64::bsub;
+    break;
+  default:
+    llvm_unreachable(
+        "Got invalid NumLanes for machine-combiner gather pattern");
+  }
+
+  auto SubRegToRegInstr =
+      BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
+              DestRegForSubregToReg)
+          .addImm(0)
+          .addReg(DestRegForMiddleIndex, getKillRegState(true))
+          .addImm(SubregType);
+  InstrIdxForVirtReg.insert(
+      std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
+  InsInstrs.push_back(SubRegToRegInstr);
+
+  // Load remaining lanes into register 1.
+  auto LanesToLoadToReg1 =
+      llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
+                       LoadToLaneInstrsAscending.end());
+  PrevReg = SubRegToRegInstr->getOperand(0).getReg();
+  for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
+    PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
+                                 LoadInstr->getOperand(3).getReg());
+    if (Index == NumLanes / 2 - 2) {
+      break;
+    }
+    DelInstrs.push_back(LoadInstr);
+  }
+  auto LastLoadReg1 = PrevReg;
+
+  // Create the final zip instruction to combine the results.
+  MachineInstrBuilder ZipInstr =
+      BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
+              Root.getOperand(0).getReg())
+          .addReg(LastLoadReg0)
+          .addReg(LastLoadReg1);
+  InsInstrs.push_back(ZipInstr);
+}
+
 CombinerObjective
 AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
   switch (Pattern) {
   case AArch64MachineCombinerPattern::SUBADD_OP1:
   case AArch64MachineCombinerPattern::SUBADD_OP2:
+  case AArch64MachineCombinerPattern::GATHER_LANE_i32:
+  case AArch64MachineCombinerPattern::GATHER_LANE_i16:
+  case AArch64MachineCombinerPattern::GATHER_LANE_i8:
     return CombinerObjective::MustReduceDepth;
   default:
     return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7425,6 +7671,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
   if (getMiscPatterns(Root, Patterns))
     return true;
 
+  // Load patterns
+  if (getLoadPatterns(Root, Patterns))
+    return true;
+
   return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
                                                      DoRegPressureReduce);
 }
@@ -8680,6 +8930,21 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
     MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
     break;
   }
+  case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
+    generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                          Pattern, 4);
+    break;
+  }
+  case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
+    generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                          Pattern, 8);
+    break;
+  }
+  case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
+    generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                          Pattern, 16);
+    break;
+  }
 
   } // end switch (Pattern)
   // Record MUL and ADD/SUB for deletion
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 7c255da333e4..02734866e712 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -172,6 +172,10 @@ enum AArch64MachineCombinerPattern : unsigned {
   FMULv8i16_indexed_OP2,
 
   FNMADD,
+
+  GATHER_LANE_i32,
+  GATHER_LANE_i16,
+  GATHER_LANE_i8
 };
 class AArch64InstrInfo final : public AArch64GenInstrInfo {
   const AArch64RegisterInfo RI;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir
new file mode 100644
index 000000000000..09eb18b0e357
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir
@@ -0,0 +1,364 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -mtriple=aarch64-none-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name:            split_loads_to_fpr128
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2, $x3, $x4
+
+    ; CHECK-LABEL: name: split_loads_to_fpr128
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+    ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
+    ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub
+    ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY2]] 
+    ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY3]], 0
+    ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub
+    ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY4]]
+    ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]]
+    ; CHECK-NEXT: $q0 = COPY [[ZIP]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:gpr64common = COPY $x0
+    %1:gpr64common = COPY $x1
+    %2:gpr64common = COPY $x2
+    %3:gpr64common = COPY $x3
+    %4:gpr64common = COPY $x4
+    %5:fpr32 = LDRSroX %0, killed %1, 0, 1
+    %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
+    %7:fpr128 = LD1i32 %6, 1, killed %2
+    %8:fpr128 = LD1i32 %7, 2, killed %3
+    %9:fpr128 = LD1i32 %8, 3, killed %4
+    $q0 = COPY %9
+    RET_ReallyLR implicit $q0
+
+---
+name:            split_loads_to_fpr128_ui
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2, $x3, $x4
+
+    ; CHECK-LABEL: name: split_loads_to_fpr128_ui
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+    ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSui [[COPY]], 0
+    ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub
+    ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY1]] 
+    ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY2]], 0
+    ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub
+    ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY3]]
+    ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]]
+    ; CHECK-NEXT: $q0 = COPY [[ZIP]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:gpr64common = COPY $x0
+    %1:gpr64common = COPY $x1
+    %2:gpr64common = COPY $x2
+    %3:gpr64common = COPY $x3
+    %4:gpr64common = COPY $x4
+    %5:fpr32 = LDRSui %0, 0
+    %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
+    %7:fpr128 = LD1i32 %6, 1, killed %1
+    %8:fpr128 = LD1i32 %7, 2, killed %2
+    %9:fpr128 = LD1i32 %8, 3, killed %3
+    $q0 = COPY %9
+    RET_ReallyLR implicit $q0
+
+---
+name:            split_loads_to_fpr128_i16
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8
+
+    ; CHECK-LABEL: name: split_loads_to_fpr128_i16
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5
+    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6
+    ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7
+    ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8
+    ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHroX [[COPY]], killed [[COPY1]], 0, 1
+    ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub
+    ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY2]]
+    ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY3]]
+    ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY4]]
+    ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY5]], 0
+    ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub
+    ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY6]]
+    ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed [[COPY7]]
+    ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY8]]
+    ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]]
+    ; CHECK-NEXT: $q0 = COPY [[ZIP]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:gpr64common = COPY $x0
+    %1:gpr64common = COPY $x1
+    %2:gpr64common = COPY $x2
+    %3:gpr64common = COPY $x3
+    %4:gpr64common = COPY $x4
+    %5:gpr64common = COPY $x5
+    %6:gpr64common = COPY $x6
+    %7:gpr64common = COPY $x7
+    %8:gpr64common = COPY $x8
+    %9:fpr16 = LDRHroX %0, killed %1, 0, 1
+    %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub
+    %11:fpr128 = LD1i16 %10, 1, killed %2
+    %12:fpr128 = LD1i16 %11, 2, killed %3
+    %13:fpr128 = LD1i16 %12, 3, killed %4
+    %14:fpr128 = LD1i16 %13, 4, killed %5
+    %15:fpr128 = LD1i16 %14, 5, killed %6
+    %16:fpr128 = LD1i16 %15, 6, killed %7
+    %17:fpr128 = LD1i16 %16, 7, killed %8
+    $q0 = COPY %17
+    RET_ReallyLR implicit $q0
+
+---
+name:            split_loads_to_fpr128_i16_ui
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8
+
+    ; CHECK-LABEL: name: split_loads_to_fpr128_i16_ui
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5
+    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6
+    ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7
+    ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8
+    ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHui [[COPY]], 0
+    ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub
+    ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY1]]
+    ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY2]]
+    ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY3]]
+    ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY4]], 0
+    ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub
+    ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY5]]
+    ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed [[COPY6]]
+    ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY7]]
+    ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]]
+    ; CHECK-NEXT: $q0 = COPY [[ZIP]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:gpr64common = COPY $x0
+    %1:gpr64common = COPY $x1
+    %2:gpr64common = COPY $x2
+    %3:gpr64common = COPY $x3
+    %4:gpr64common = COPY $x4
+    %5:gpr64common = COPY $x5
+    %6:gpr64common = COPY $x6
+    %7:gpr64common = COPY $x7
+    %8:gpr64common = COPY $x8
+    %9:fpr16 = LDRHui %0, 0
+    %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub
+    %11:fpr128 = LD1i16 %10, 1, killed %1
+    %12:fpr128 = LD1i16 %11, 2, killed %2
+    %13:fpr128 = LD1i16 %12, 3, killed %3
+    %14:fpr128 = LD1i16 %13, 4, killed %4
+    %15:fpr128 = LD1i16 %14, 5, killed %5
+    %16:fpr128 = LD1i16 %15, 6, killed %6
+    %17:fpr128 = LD1i16 %16, 7, killed %7
+    $q0 = COPY %17
+    RET_ReallyLR implicit $q0
+
+---
+name:            split_loads_to_fpr128_i8
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16
+
+    ; CHECK-LABEL: name: split_loads_to_fpr128_i8
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5
+    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6
+    ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7
+    ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8
+    ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gpr64common = COPY $x9
+    ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gpr64common = COPY $x10
+    ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gpr64common = COPY $x11
+    ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gpr64common = COPY $x12
+    ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gpr64common = COPY $x13
+    ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gpr64common = COPY $x14
+    ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gpr64common = COPY $x15
+    ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gpr64common = COPY $x16
+    ; CHECK-NEXT: [[LD_i8:%[0-9]+]]:fpr8 = LDRBroX [[COPY]], killed [[COPY1]], 0, 0
+    ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i8]], %subreg.bsub
+    ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i8 [[FIRST_REG]], 1, killed [[COPY2]]
+    ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i8 [[LD0_1]], 2, killed [[COPY3]]
+    ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i8 [[LD0_2]], 3, killed [[COPY4]]
+    ; CHECK-NEXT: [[LD0_4:%[0-9]+]]:fpr128 = LD1i8 [[LD0_3]], 4, killed [[COPY5]]
+    ; CHECK-NEXT: [[LD0_5:%[0-9]+]]:fpr128 = LD1i8 [[LD0_4]], 5, killed [[COPY6]]
+    ; CHECK-NEXT: [[LD0_6:%[0-9]+]]:fpr128 = LD1i8 [[LD0_5]], 6, killed [[COPY7]]
+    ; CHECK-NEXT: [[LD0_7:%[0-9]+]]:fpr128 = LD1i8 [[LD0_6]], 7, killed [[COPY8]]
+    ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr8 = LDRBui [[COPY9]], 0
+    ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.bsub
+    ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i8 [[SECOND_REG]], 1, killed [[COPY10]]
+    ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i8 [[LD1_1]], 2, killed [[COPY11]]
+    ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i8 [[LD1_2]], 3, killed [[COPY12]]
+    ; CHECK-NEXT: [[LD1_4:%[0-9]+]]:fpr128 = LD1i8 [[LD1_3]], 4, killed [[COPY13]]
+    ; CHECK-NEXT: [[LD1_5:%[0-9]+]]:fpr128 = LD1i8 [[LD1_4]], 5, killed [[COPY14]]
+    ; CHECK-NEXT: [[LD1_6:%[0-9]+]]:fpr128 = LD1i8 [[LD1_5]], 6, killed [[COPY15]]
+    ; CHECK-NEXT: [[LD1_7:%[0-9]+]]:fpr128 = LD1i8 [[LD1_6]], 7, killed [[COPY16]]
+    ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_7]], [[LD1_7]]
+    ; CHECK-NEXT: $q0 = COPY [[ZIP]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:gpr64common = COPY $x0
+    %1:gpr64common = COPY $x1
+    %2:gpr64common = COPY $x2
+    %3:gpr64common = COPY $x3
+    %4:gpr64common = COPY $x4
+    %5:gpr64common = COPY $x5
+    %6:gpr64common = COPY $x6
+    %7:gpr64common = COPY $x7
+    %8:gpr64common = COPY $x8
+    %9:gpr64common = COPY $x9
+    %10:gpr64common = COPY $x10
+    %11:gpr64common = COPY $x11
+    %12:gpr64common = COPY $x12
+    %13:gpr64common = COPY $x13
+    %14:gpr64common = COPY $x14
+    %15:gpr64common = COPY $x15
+    %16:gpr64common = COPY $x16
+    %17:fpr8 = LDRBroX %0, killed %1, 0, 0
+    %18:fpr128 = SUBREG_TO_REG 0, killed %17, %subreg.bsub
+    %19:fpr128 = LD1i8 %18, 1, killed %2
+    %20:fpr128 = LD1i8 %19, 2, killed %3
+    %21:fpr128 = LD1i8 %20, 3, killed %4
+    %22:fpr128 = LD1i8 %21, 4, killed %5
+    %23:fpr128 = LD1i8 %22, 5, killed %6
+    %24:fpr128 = LD1i8 %23, 6, killed %7
+    %25:fpr128 = LD1i8 %24, 7, killed %8
+    %26:fpr128 = LD1i8 %25, 8, killed %9
+    %27:fpr128 = LD1i8 %26, 9, killed %10
+    %28:fpr128 = LD1i8 %27, 10, killed %11
+    %29:fpr128 = LD1i8 %28, 11, killed %12
+    %30:fpr128 = LD1i8 %29, 12, killed %13
+    %31:fpr128 = LD1i8 %30, 13, killed %14
+    %32:fpr128 = LD1i8 %31, 14, killed %15
+    %33:fpr128 = LD1i8 %32, 15, killed %16
+    $q0 = COPY %33
+    RET_ReallyLR implicit $q0
+
+---
+name:            negative_pattern_missing_lanes
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: negative_pattern_missing_lanes
+    ; CHECK:      [[LD1:%.*]]:fpr128 = LDRQui $x1, 0
+    ; CHECK-NEXT: [[LD2:%.*]]:fpr128 = LD1i32 [[LD1]]
+  
+    %0:gpr64common = COPY $x0
+    %1:fpr128 = LDRQui $x1, 0
+    %2:fpr128 = LD1i32 %1, 3, %0
+    $q0 = COPY %2
+    RET_ReallyLR implicit $q0
+
+---
+name:            out_of_order_lanes
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2, $x3, $x4
+
+    ; CHECK-LABEL: name: out_of_order_lanes
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+    ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
+    ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub
+    ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY3]] 
+    ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY2]], 0
+    ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub
+    ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY4]]
+    ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]]
+    ; CHECK-NEXT: $q0 = COPY [[ZIP]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:gpr64common = COPY $x0
+    %1:gpr64common = COPY $x1
+    %2:gpr64common = COPY $x2
+    %3:gpr64common = COPY $x3
+    %4:gpr64common = COPY $x4
+    %5:fpr32 = LDRSroX %0, killed %1, 0, 1
+    %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
+    %7:fpr128 = LD1i32 %6, 2, killed %2
+    %8:fpr128 = LD1i32 %7, 1, killed %3
+    %9:fpr128 = LD1i32 %8, 3, killed %4
+    $q0 = COPY %9
+    RET_ReallyLR implicit $q0
+
+---
+name:            negative_pattern_no_subreg_to_reg
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2, $x3
+
+    ; CHECK-LABEL: name: negative_pattern_no_subreg_to_reg
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+    ; CHECK-NEXT: [[INITIAL_VEC:%[0-9]+]]:fpr128 = LDRQui [[COPY]], 0
+    ; CHECK-NEXT: [[LD_LANE_1:%[0-9]+]]:fpr128 = LD1i32 [[INITIAL_VEC]], 1, killed [[COPY1]]
+    ; CHECK-NEXT: [[LD_LANE_2:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_1]], 2, killed [[COPY2]]
+    ; CHECK-NEXT: [[LD_LANE_3:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_2]], 3, killed [[COPY3]]
+    ; CHECK-NEXT: $q0 = COPY [[LD_LANE_3]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:gpr64common = COPY $x0
+    %1:gpr64common = COPY $x1
+    %2:gpr64common = COPY $x2
+    %3:gpr64common = COPY $x3
+    %4:fpr128 = LDRQui %0, 0            
+    %5:fpr128 = LD1i32 %4, 1, killed %1 
+    %6:fpr128 = LD1i32 %5, 2, killed %2
+    %7:fpr128 = LD1i32 %6, 3, killed %3
+    $q0 = COPY %7
+    RET_ReallyLR implicit $q0
+
+---
+name:            negative_pattern_multiple_users
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2, $x3, $x4
+
+    ; CHECK-LABEL: name: negative_pattern_multiple_users
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
+    ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
+    ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub
+    ; CHECK-NEXT: [[LD_LANE_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY2]]
+    ; CHECK-NEXT: [[LD_LANE_2:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_1]], 2, killed [[COPY3]]
+    ; CHECK-NEXT: [[LD_LANE_3:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_2]], 3, killed [[COPY4]]
+    ; CHECK-NEXT: $q0 = COPY [[LD_LANE_3]]
+    ; CHECK-NEXT: $q1 = COPY [[LD_LANE_2]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
+    %0:gpr64common = COPY $x0
+    %1:gpr64common = COPY $x1
+    %2:gpr64common = COPY $x2
+    %3:gpr64common = COPY $x3
+    %4:gpr64common = COPY $x4
+    %5:fpr32 = LDRSroX %0, killed %1, 0, 1
+    %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
+    %7:fpr128 = LD1i32 %6, 1, killed %2
+    %8:fpr128 = LD1i32 %7, 2, killed %3
+    %9:fpr128 = LD1i32 %8, 3, killed %4
+    $q0 = COPY %9
+    $q1 = COPY %8
+    RET_ReallyLR implicit $q0, implicit $q1
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
index 7686740aec30..13434fabefa7 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
@@ -203,89 +203,93 @@ define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c)
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $s1 killed $s1 def $q1
 ; CHECK-NEXT:    // kill: def $s3 killed $s3 def $q3
-; CHECK-NEXT:    ldr s17, [sp, #40]
-; CHECK-NEXT:    add x10, sp, #56
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    // kill: def $s2 killed $s2 def $q2
+; CHECK-NEXT:    ldr s17, [sp, #32]
+; CHECK-NEXT:    // kill: def $s5 killed $s5 def $q5
 ; CHECK-NEXT:    add x9, sp, #48
+; CHECK-NEXT:    add x10, sp, #64
 ; CHECK-NEXT:    mov v1.s[1], v3.s[0]
-; CHECK-NEXT:    ldr s3, [sp, #32]
-; CHECK-NEXT:    // kill: def $s2 killed $s2 def $q2
 ; CHECK-NEXT:    mov v0.s[1], v2.s[0]
-; CHECK-NEXT:    ld1 { v17.s }[1], [x10]
-; CHECK-NEXT:    // kill: def $s5 killed $s5 def $q5
-; CHECK-NEXT:    ldr s16, [sp, #8]
 ; CHECK-NEXT:    // kill: def $s4 killed $s4 def $q4
-; CHECK-NEXT:    add x10, sp, #24
-; CHECK-NEXT:    ld1 { v3.s }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #72
-; CHECK-NEXT:    // kill: def $s7 killed $s7 def $q7
+; CHECK-NEXT:    add x11, sp, #72
+; CHECK-NEXT:    ld1 { v17.s }[1], [x9]
+; CHECK-NEXT:    ldr s18, [x10]
+; CHECK-NEXT:    add x9, sp, #80
+; CHECK-NEXT:    add x10, sp, #56
 ; CHECK-NEXT:    // kill: def $s6 killed $s6 def $q6
+; CHECK-NEXT:    // kill: def $s7 killed $s7 def $q7
+; CHECK-NEXT:    ldr s16, [sp, #8]
+; CHECK-NEXT:    ldr s3, [sp, #96]
+; CHECK-NEXT:    ld1 { v18.s }[1], [x9]
+; CHECK-NEXT:    add x9, sp, #88
 ; CHECK-NEXT:    ldr s2, [sp]
-; CHECK-NEXT:    ld1 { v16.s }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #112
-; CHECK-NEXT:    ldr s20, [sp, #136]
 ; CHECK-NEXT:    mov v1.s[2], v5.s[0]
-; CHECK-NEXT:    ld1 { v17.s }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #64
-; CHECK-NEXT:    ldr s5, [sp, #96]
-; CHECK-NEXT:    ld1 { v3.s }[2], [x9]
+; CHECK-NEXT:    ldr s5, [sp, #40]
 ; CHECK-NEXT:    mov v0.s[2], v4.s[0]
-; CHECK-NEXT:    add x9, sp, #88
-; CHECK-NEXT:    ldr s4, [sp, #104]
-; CHECK-NEXT:    ldr s19, [sp, #192]
 ; CHECK-NEXT:    ld1 { v5.s }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #80
-; CHECK-NEXT:    ld1 { v17.s }[3], [x9]
-; CHECK-NEXT:    mov v1.s[3], v7.s[0]
-; CHECK-NEXT:    add x9, sp, #120
-; CHECK-NEXT:    ld1 { v3.s }[3], [x10]
-; CHECK-NEXT:    ld1 { v4.s }[1], [x9]
-; CHECK-NEXT:    ldr s7, [sp, #128]
+; CHECK-NEXT:    ldr s19, [x11]
 ; CHECK-NEXT:    add x10, sp, #144
+; CHECK-NEXT:    zip1 v4.2d, v17.2d, v18.2d
+; CHECK-NEXT:    add x11, sp, #160
+; CHECK-NEXT:    ldr s18, [sp, #136]
+; CHECK-NEXT:    ld1 { v19.s }[1], [x9]
 ; CHECK-NEXT:    mov v0.s[3], v6.s[0]
-; CHECK-NEXT:    add x9, sp, #16
+; CHECK-NEXT:    ldr s6, [sp, #128]
+; CHECK-NEXT:    mov v1.s[3], v7.s[0]
+; CHECK-NEXT:    add x9, sp, #24
+; CHECK-NEXT:    ldr s7, [sp, #104]
+; CHECK-NEXT:    ld1 { v16.s }[1], [x9]
+; CHECK-NEXT:    add x9, sp, #112
+; CHECK-NEXT:    ld1 { v6.s }[1], [x10]
+; CHECK-NEXT:    zip1 v5.2d, v5.2d, v19.2d
+; CHECK-NEXT:    add x10, sp, #120
+; CHECK-NEXT:    ld1 { v3.s }[1], [x9]
 ; CHECK-NEXT:    ld1 { v7.s }[1], [x10]
-; CHECK-NEXT:    ld1 { v2.s }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #160
-; CHECK-NEXT:    fmul v6.4s, v17.4s, v1.4s
-; CHECK-NEXT:    fmul v18.4s, v4.4s, v16.4s
-; CHECK-NEXT:    fmul v16.4s, v5.4s, v16.4s
-; CHECK-NEXT:    fmul v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    add x10, sp, #208
-; CHECK-NEXT:    ld1 { v7.s }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #152
-; CHECK-NEXT:    ld1 { v19.s }[1], [x10]
-; CHECK-NEXT:    ld1 { v20.s }[1], [x9]
+; CHECK-NEXT:    ldr s17, [x11]
 ; CHECK-NEXT:    add x9, sp, #176
-; CHECK-NEXT:    add x10, sp, #184
-; CHECK-NEXT:    fneg v6.4s, v6.4s
-; CHECK-NEXT:    fneg v18.4s, v18.4s
-; CHECK-NEXT:    fmla v16.4s, v2.4s, v4.4s
-; CHECK-NEXT:    fmla v1.4s, v0.4s, v17.4s
-; CHECK-NEXT:    ld1 { v7.s }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #168
-; CHECK-NEXT:    ld1 { v20.s }[2], [x9]
-; CHECK-NEXT:    ldr s4, [sp, #200]
+; CHECK-NEXT:    add x10, sp, #16
+; CHECK-NEXT:    add x11, sp, #168
+; CHECK-NEXT:    ld1 { v17.s }[1], [x9]
+; CHECK-NEXT:    ld1 { v2.s }[1], [x10]
+; CHECK-NEXT:    add x9, sp, #152
+; CHECK-NEXT:    fmul v19.4s, v5.4s, v1.4s
+; CHECK-NEXT:    fmul v20.4s, v7.4s, v16.4s
+; CHECK-NEXT:    fmul v16.4s, v3.4s, v16.4s
+; CHECK-NEXT:    fmul v1.4s, v4.4s, v1.4s
+; CHECK-NEXT:    ld1 { v18.s }[1], [x9]
+; CHECK-NEXT:    ldr s21, [x11]
+; CHECK-NEXT:    zip1 v6.2d, v6.2d, v17.2d
+; CHECK-NEXT:    ldr s17, [sp, #192]
+; CHECK-NEXT:    add x9, sp, #184
+; CHECK-NEXT:    add x10, sp, #208
+; CHECK-NEXT:    ld1 { v21.s }[1], [x9]
 ; CHECK-NEXT:    add x9, sp, #216
-; CHECK-NEXT:    fmla v6.4s, v0.4s, v3.4s
-; CHECK-NEXT:    fmla v18.4s, v2.4s, v5.4s
-; CHECK-NEXT:    ld1 { v4.s }[1], [x9]
-; CHECK-NEXT:    fsub v0.4s, v7.4s, v1.4s
-; CHECK-NEXT:    fsub v1.4s, v19.4s, v16.4s
-; CHECK-NEXT:    ld1 { v20.s }[3], [x10]
-; CHECK-NEXT:    fadd v2.4s, v4.4s, v18.4s
-; CHECK-NEXT:    fadd v3.4s, v20.4s, v6.4s
+; CHECK-NEXT:    fneg v19.4s, v19.4s
+; CHECK-NEXT:    fneg v20.4s, v20.4s
+; CHECK-NEXT:    fmla v16.4s, v2.4s, v7.4s
+; CHECK-NEXT:    fmla v1.4s, v0.4s, v5.4s
+; CHECK-NEXT:    ld1 { v17.s }[1], [x10]
+; CHECK-NEXT:    ldr s5, [sp, #200]
+; CHECK-NEXT:    zip1 v7.2d, v18.2d, v21.2d
+; CHECK-NEXT:    ld1 { v5.s }[1], [x9]
+; CHECK-NEXT:    fmla v19.4s, v0.4s, v4.4s
+; CHECK-NEXT:    fmla v20.4s, v2.4s, v3.4s
+; CHECK-NEXT:    fsub v0.4s, v6.4s, v1.4s
+; CHECK-NEXT:    fsub v1.4s, v17.4s, v16.4s
+; CHECK-NEXT:    fadd v2.4s, v7.4s, v19.4s
+; CHECK-NEXT:    fadd v3.4s, v5.4s, v20.4s
 ; CHECK-NEXT:    ext v4.16b, v0.16b, v1.16b, #12
-; CHECK-NEXT:    ext v5.16b, v3.16b, v2.16b, #12
-; CHECK-NEXT:    trn2 v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    ext v5.16b, v2.16b, v3.16b, #12
+; CHECK-NEXT:    trn2 v1.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    ext v4.16b, v0.16b, v4.16b, #12
-; CHECK-NEXT:    ext v5.16b, v3.16b, v5.16b, #8
+; CHECK-NEXT:    ext v5.16b, v2.16b, v5.16b, #8
 ; CHECK-NEXT:    rev64 v4.4s, v4.4s
-; CHECK-NEXT:    trn2 v2.4s, v4.4s, v5.4s
-; CHECK-NEXT:    zip2 v4.4s, v0.4s, v3.4s
-; CHECK-NEXT:    zip1 v0.4s, v0.4s, v3.4s
-; CHECK-NEXT:    ext v1.16b, v2.16b, v1.16b, #8
-; CHECK-NEXT:    mov v4.d[1], v2.d[0]
+; CHECK-NEXT:    trn2 v3.4s, v4.4s, v5.4s
+; CHECK-NEXT:    zip2 v4.4s, v0.4s, v2.4s
+; CHECK-NEXT:    zip1 v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ext v1.16b, v3.16b, v1.16b, #8
+; CHECK-NEXT:    mov v4.d[1], v3.d[0]
 ; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    stp q4, q1, [x8, #16]
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll
index acf15f1bd117..e6f27b95d92c 100644
--- a/llvm/test/CodeGen/AArch64/concat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector.ll
@@ -186,8 +186,9 @@ define <16 x i8> @concat_v16s8_v4s8_load(ptr %ptrA, ptr %ptrB, ptr %ptrC, ptr %p
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x1]
-; CHECK-NEXT:    ld1 { v0.s }[2], [x2]
-; CHECK-NEXT:    ld1 { v0.s }[3], [x3]
+; CHECK-NEXT:    ldr s1, [x2]
+; CHECK-NEXT:    ld1 { v1.s }[1], [x3]
+; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    ret
     %A = load <4 x i8>, ptr %ptrA
     %B = load <4 x i8>, ptr %ptrB
diff --git a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll
index c6b8e41f9bdf..4906e2e15e51 100644
--- a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll
+++ b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll
@@ -1431,6 +1431,7 @@ define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) {
 ; FULLFP16-NEXT:    add x9, sp, #16
 ; FULLFP16-NEXT:    // kill: def $h3 killed $h3 def $q3
 ; FULLFP16-NEXT:    // kill: def $h4 killed $h4 def $q4
+; FULLFP16-NEXT:    add x10, sp, #40
 ; FULLFP16-NEXT:    // kill: def $h5 killed $h5 def $q5
 ; FULLFP16-NEXT:    // kill: def $h6 killed $h6 def $q6
 ; FULLFP16-NEXT:    // kill: def $h7 killed $h7 def $q7
@@ -1439,30 +1440,30 @@ define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) {
 ; FULLFP16-NEXT:    ld1 { v1.h }[1], [x9]
 ; FULLFP16-NEXT:    add x9, sp, #24
 ; FULLFP16-NEXT:    mov v0.h[2], v2.h[0]
-; FULLFP16-NEXT:    ldr h2, [sp]
 ; FULLFP16-NEXT:    ld1 { v1.h }[2], [x9]
 ; FULLFP16-NEXT:    add x9, sp, #32
-; FULLFP16-NEXT:    fminnm v2.8h, v2.8h, v2.8h
 ; FULLFP16-NEXT:    mov v0.h[3], v3.h[0]
 ; FULLFP16-NEXT:    ld1 { v1.h }[3], [x9]
-; FULLFP16-NEXT:    add x9, sp, #40
-; FULLFP16-NEXT:    ldr h3, [sp, #72]
-; FULLFP16-NEXT:    ld1 { v1.h }[4], [x9]
+; FULLFP16-NEXT:    ldr h2, [x10]
 ; FULLFP16-NEXT:    add x9, sp, #48
+; FULLFP16-NEXT:    ldr h3, [sp, #72]
+; FULLFP16-NEXT:    ld1 { v2.h }[1], [x9]
+; FULLFP16-NEXT:    add x9, sp, #56
 ; FULLFP16-NEXT:    fminnm v3.8h, v3.8h, v3.8h
 ; FULLFP16-NEXT:    mov v0.h[4], v4.h[0]
-; FULLFP16-NEXT:    ld1 { v1.h }[5], [x9]
-; FULLFP16-NEXT:    add x9, sp, #56
-; FULLFP16-NEXT:    fmaxnm v2.8h, v2.8h, v3.8h
-; FULLFP16-NEXT:    mov v0.h[5], v5.h[0]
-; FULLFP16-NEXT:    ld1 { v1.h }[6], [x9]
+; FULLFP16-NEXT:    ld1 { v2.h }[2], [x9]
 ; FULLFP16-NEXT:    add x9, sp, #64
-; FULLFP16-NEXT:    str h2, [x8, #16]
+; FULLFP16-NEXT:    mov v0.h[5], v5.h[0]
+; FULLFP16-NEXT:    ld1 { v2.h }[3], [x9]
+; FULLFP16-NEXT:    zip1 v1.2d, v1.2d, v2.2d
+; FULLFP16-NEXT:    ldr h2, [sp]
 ; FULLFP16-NEXT:    mov v0.h[6], v6.h[0]
-; FULLFP16-NEXT:    ld1 { v1.h }[7], [x9]
+; FULLFP16-NEXT:    fminnm v2.8h, v2.8h, v2.8h
 ; FULLFP16-NEXT:    fminnm v1.8h, v1.8h, v1.8h
 ; FULLFP16-NEXT:    mov v0.h[7], v7.h[0]
+; FULLFP16-NEXT:    fmaxnm v2.8h, v2.8h, v3.8h
 ; FULLFP16-NEXT:    fminnm v0.8h, v0.8h, v0.8h
+; FULLFP16-NEXT:    str h2, [x8, #16]
 ; FULLFP16-NEXT:    fmaxnm v0.8h, v0.8h, v1.8h
 ; FULLFP16-NEXT:    str q0, [x8]
 ; FULLFP16-NEXT:    ret
@@ -2012,6 +2013,7 @@ define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) {
 ; FULLFP16-NEXT:    add x9, sp, #16
 ; FULLFP16-NEXT:    // kill: def $h3 killed $h3 def $q3
 ; FULLFP16-NEXT:    // kill: def $h4 killed $h4 def $q4
+; FULLFP16-NEXT:    add x10, sp, #40
 ; FULLFP16-NEXT:    // kill: def $h5 killed $h5 def $q5
 ; FULLFP16-NEXT:    // kill: def $h6 killed $h6 def $q6
 ; FULLFP16-NEXT:    // kill: def $h7 killed $h7 def $q7
@@ -2020,30 +2022,30 @@ define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) {
 ; FULLFP16-NEXT:    ld1 { v1.h }[1], [x9]
 ; FULLFP16-NEXT:    add x9, sp, #24
 ; FULLFP16-NEXT:    mov v0.h[2], v2.h[0]
-; FULLFP16-NEXT:    ldr h2, [sp]
 ; FULLFP16-NEXT:    ld1 { v1.h }[2], [x9]
 ; FULLFP16-NEXT:    add x9, sp, #32
-; FULLFP16-NEXT:    fminnm v2.8h, v2.8h, v2.8h
 ; FULLFP16-NEXT:    mov v0.h[3], v3.h[0]
 ; FULLFP16-NEXT:    ld1 { v1.h }[3], [x9]
-; FULLFP16-NEXT:    add x9, sp, #40
-; FULLFP16-NEXT:    ldr h3, [sp, #72]
-; FULLFP16-NEXT:    ld1 { v1.h }[4], [x9]
+; FULLFP16-NEXT:    ldr h2, [x10]
 ; FULLFP16-NEXT:    add x9, sp, #48
+; FULLFP16-NEXT:    ldr h3, [sp, #72]
+; FULLFP16-NEXT:    ld1 { v2.h }[1], [x9]
+; FULLFP16-NEXT:    add x9, sp, #56
 ; FULLFP16-NEXT:    fminnm v3.8h, v3.8h, v3.8h
 ; FULLFP16-NEXT:    mov v0.h[4], v4.h[0]
-; FULLFP16-NEXT:    ld1 { v1.h }[5], [x9]
-; FULLFP16-NEXT:    add x9, sp, #56
-; FULLFP16-NEXT:    fminnm v2.8h, v2.8h, v3.8h
-; FULLFP16-NEXT:    mov v0.h[5], v5.h[0]
-; FULLFP16-NEXT:    ld1 { v1.h }[6], [x9]
+; FULLFP16-NEXT:    ld1 { v2.h }[2], [x9]
 ; FULLFP16-NEXT:    add x9, sp, #64
-; FULLFP16-NEXT:    str h2, [x8, #16]
+; FULLFP16-NEXT:    mov v0.h[5], v5.h[0]
+; FULLFP16-NEXT:    ld1 { v2.h }[3], [x9]
+; FULLFP16-NEXT:    zip1 v1.2d, v1.2d, v2.2d
+; FULLFP16-NEXT:    ldr h2, [sp]
 ; FULLFP16-NEXT:    mov v0.h[6], v6.h[0]
-; FULLFP16-NEXT:    ld1 { v1.h }[7], [x9]
+; FULLFP16-NEXT:    fminnm v2.8h, v2.8h, v2.8h
 ; FULLFP16-NEXT:    fminnm v1.8h, v1.8h, v1.8h
 ; FULLFP16-NEXT:    mov v0.h[7], v7.h[0]
+; FULLFP16-NEXT:    fminnm v2.8h, v2.8h, v3.8h
 ; FULLFP16-NEXT:    fminnm v0.8h, v0.8h, v0.8h
+; FULLFP16-NEXT:    str h2, [x8, #16]
 ; FULLFP16-NEXT:    fminnm v0.8h, v0.8h, v1.8h
 ; FULLFP16-NEXT:    str q0, [x8]
 ; FULLFP16-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll
index 4c28c9082402..ae2ef2649102 100644
--- a/llvm/test/CodeGen/AArch64/fsh.ll
+++ b/llvm/test/CodeGen/AArch64/fsh.ll
@@ -2509,87 +2509,88 @@ define <7 x i32> @fshl_v7i32(<7 x i32> %a, <7 x i32> %b, <7 x i32> %c) {
 ;
 ; CHECK-GI-LABEL: fshl_v7i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr s3, [sp, #48]
-; CHECK-GI-NEXT:    ldr s20, [sp, #56]
-; CHECK-GI-NEXT:    add x9, sp, #56
+; CHECK-GI-NEXT:    ldr s17, [sp, #48]
+; CHECK-GI-NEXT:    add x8, sp, #56
+; CHECK-GI-NEXT:    add x9, sp, #64
 ; CHECK-GI-NEXT:    ldr s4, [sp, #48]
-; CHECK-GI-NEXT:    ldr s7, [sp, #80]
-; CHECK-GI-NEXT:    mov w12, #-1 // =0xffffffff
-; CHECK-GI-NEXT:    ldr s21, [sp, #88]
-; CHECK-GI-NEXT:    mov v3.s[1], v20.s[0]
-; CHECK-GI-NEXT:    fmov s20, w12
-; CHECK-GI-NEXT:    ld1 { v4.s }[1], [x9]
-; CHECK-GI-NEXT:    ldr s17, [sp]
-; CHECK-GI-NEXT:    add x13, sp, #64
-; CHECK-GI-NEXT:    mov v7.s[1], v21.s[0]
+; CHECK-GI-NEXT:    ldr s21, [sp, #56]
+; CHECK-GI-NEXT:    mov w10, #-1 // =0xffffffff
+; CHECK-GI-NEXT:    ld1 { v17.s }[1], [x8]
+; CHECK-GI-NEXT:    ldr s20, [x9]
+; CHECK-GI-NEXT:    add x8, sp, #72
+; CHECK-GI-NEXT:    mov v4.s[1], v21.s[0]
 ; CHECK-GI-NEXT:    fmov s21, w7
+; CHECK-GI-NEXT:    ldr s6, [sp]
+; CHECK-GI-NEXT:    ld1 { v20.s }[1], [x8]
 ; CHECK-GI-NEXT:    ldr s19, [sp, #64]
-; CHECK-GI-NEXT:    mov w11, #31 // =0x1f
-; CHECK-GI-NEXT:    mov v20.s[1], w12
+; CHECK-GI-NEXT:    ldr s7, [sp, #80]
+; CHECK-GI-NEXT:    ldr s22, [sp, #88]
+; CHECK-GI-NEXT:    mov w9, #31 // =0x1f
+; CHECK-GI-NEXT:    mov w11, #1 // =0x1
+; CHECK-GI-NEXT:    mov v21.s[1], v6.s[0]
+; CHECK-GI-NEXT:    fmov s6, w9
 ; CHECK-GI-NEXT:    ldr s18, [sp, #96]
-; CHECK-GI-NEXT:    ld1 { v4.s }[2], [x13]
-; CHECK-GI-NEXT:    mov w13, #1 // =0x1
-; CHECK-GI-NEXT:    mov v3.s[2], v19.s[0]
-; CHECK-GI-NEXT:    mov v21.s[1], v17.s[0]
-; CHECK-GI-NEXT:    fmov s17, w11
-; CHECK-GI-NEXT:    fmov s19, w13
+; CHECK-GI-NEXT:    zip1 v17.2d, v17.2d, v20.2d
+; CHECK-GI-NEXT:    fmov s20, w10
+; CHECK-GI-NEXT:    mov v7.s[1], v22.s[0]
+; CHECK-GI-NEXT:    mov v4.s[2], v19.s[0]
+; CHECK-GI-NEXT:    fmov s19, w11
 ; CHECK-GI-NEXT:    fmov s23, w0
-; CHECK-GI-NEXT:    fmov s24, w11
-; CHECK-GI-NEXT:    ldr s6, [sp, #8]
+; CHECK-GI-NEXT:    mov v6.s[1], w9
+; CHECK-GI-NEXT:    fmov s24, w9
+; CHECK-GI-NEXT:    ldr s2, [sp, #8]
+; CHECK-GI-NEXT:    mov v20.s[1], w10
 ; CHECK-GI-NEXT:    ldr s0, [sp, #24]
 ; CHECK-GI-NEXT:    ldr s5, [sp, #32]
+; CHECK-GI-NEXT:    mov v19.s[1], w11
 ; CHECK-GI-NEXT:    mov v7.s[2], v18.s[0]
-; CHECK-GI-NEXT:    mov v17.s[1], w11
-; CHECK-GI-NEXT:    mov v19.s[1], w13
-; CHECK-GI-NEXT:    mov v20.s[2], w12
 ; CHECK-GI-NEXT:    ldr s16, [sp, #72]
 ; CHECK-GI-NEXT:    mov v23.s[1], w1
 ; CHECK-GI-NEXT:    ldr s18, [sp, #80]
-; CHECK-GI-NEXT:    mov v21.s[2], v6.s[0]
-; CHECK-GI-NEXT:    mov v24.s[1], w11
+; CHECK-GI-NEXT:    mov v21.s[2], v2.s[0]
+; CHECK-GI-NEXT:    mov v24.s[1], w9
 ; CHECK-GI-NEXT:    mov v0.s[1], v5.s[0]
-; CHECK-GI-NEXT:    fmov s6, w4
-; CHECK-GI-NEXT:    add x10, sp, #88
+; CHECK-GI-NEXT:    fmov s5, w4
+; CHECK-GI-NEXT:    mov v20.s[2], w10
+; CHECK-GI-NEXT:    add x8, sp, #88
 ; CHECK-GI-NEXT:    movi v22.4s, #31
-; CHECK-GI-NEXT:    mov v3.s[3], v16.s[0]
-; CHECK-GI-NEXT:    mov v17.s[2], w11
-; CHECK-GI-NEXT:    mov v19.s[2], w13
-; CHECK-GI-NEXT:    ldr s2, [sp, #16]
-; CHECK-GI-NEXT:    ldr s1, [sp, #40]
-; CHECK-GI-NEXT:    ld1 { v18.s }[1], [x10]
-; CHECK-GI-NEXT:    eor v5.16b, v7.16b, v20.16b
+; CHECK-GI-NEXT:    mov v4.s[3], v16.s[0]
+; CHECK-GI-NEXT:    mov v6.s[2], w9
+; CHECK-GI-NEXT:    mov v19.s[2], w11
+; CHECK-GI-NEXT:    ldr s1, [sp, #16]
+; CHECK-GI-NEXT:    ldr s3, [sp, #40]
+; CHECK-GI-NEXT:    ld1 { v18.s }[1], [x8]
 ; CHECK-GI-NEXT:    mov v23.s[2], w2
-; CHECK-GI-NEXT:    mov v6.s[1], w5
-; CHECK-GI-NEXT:    add x8, sp, #72
-; CHECK-GI-NEXT:    add x9, sp, #96
-; CHECK-GI-NEXT:    mov v21.s[3], v2.s[0]
-; CHECK-GI-NEXT:    mov v24.s[2], w11
-; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT:    ld1 { v4.s }[3], [x8]
-; CHECK-GI-NEXT:    bic v2.16b, v22.16b, v3.16b
-; CHECK-GI-NEXT:    ld1 { v18.s }[2], [x9]
-; CHECK-GI-NEXT:    and v1.16b, v5.16b, v17.16b
+; CHECK-GI-NEXT:    mov v5.s[1], w5
+; CHECK-GI-NEXT:    add x8, sp, #96
+; CHECK-GI-NEXT:    eor v2.16b, v7.16b, v20.16b
+; CHECK-GI-NEXT:    mov v21.s[3], v1.s[0]
+; CHECK-GI-NEXT:    mov v24.s[2], w9
+; CHECK-GI-NEXT:    mov v0.s[2], v3.s[0]
+; CHECK-GI-NEXT:    bic v1.16b, v22.16b, v4.16b
+; CHECK-GI-NEXT:    ld1 { v18.s }[2], [x8]
 ; CHECK-GI-NEXT:    neg v3.4s, v19.4s
+; CHECK-GI-NEXT:    and v4.16b, v17.16b, v22.16b
+; CHECK-GI-NEXT:    and v2.16b, v2.16b, v6.16b
 ; CHECK-GI-NEXT:    mov v23.s[3], w3
-; CHECK-GI-NEXT:    mov v6.s[2], w6
-; CHECK-GI-NEXT:    and v4.16b, v4.16b, v22.16b
-; CHECK-GI-NEXT:    ushr v5.4s, v21.4s, #1
-; CHECK-GI-NEXT:    neg v2.4s, v2.4s
-; CHECK-GI-NEXT:    and v7.16b, v18.16b, v24.16b
+; CHECK-GI-NEXT:    mov v5.s[2], w6
+; CHECK-GI-NEXT:    ushr v6.4s, v21.4s, #1
 ; CHECK-GI-NEXT:    neg v1.4s, v1.4s
+; CHECK-GI-NEXT:    and v7.16b, v18.16b, v24.16b
 ; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v3.4s
+; CHECK-GI-NEXT:    neg v2.4s, v2.4s
 ; CHECK-GI-NEXT:    ushl v3.4s, v23.4s, v4.4s
-; CHECK-GI-NEXT:    ushl v2.4s, v5.4s, v2.4s
-; CHECK-GI-NEXT:    ushl v4.4s, v6.4s, v7.4s
-; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    orr v1.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT:    ushl v1.4s, v6.4s, v1.4s
+; CHECK-GI-NEXT:    ushl v4.4s, v5.4s, v7.4s
+; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    orr v1.16b, v3.16b, v1.16b
 ; CHECK-GI-NEXT:    orr v0.16b, v4.16b, v0.16b
 ; CHECK-GI-NEXT:    mov s2, v1.s[1]
 ; CHECK-GI-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NEXT:    mov s4, v1.s[3]
+; CHECK-GI-NEXT:    fmov w0, s1
 ; CHECK-GI-NEXT:    mov s5, v0.s[1]
 ; CHECK-GI-NEXT:    mov s6, v0.s[2]
-; CHECK-GI-NEXT:    fmov w0, s1
 ; CHECK-GI-NEXT:    fmov w4, s0
 ; CHECK-GI-NEXT:    fmov w1, s2
 ; CHECK-GI-NEXT:    fmov w2, s3
diff --git a/llvm/test/CodeGen/AArch64/llvm.frexp.ll b/llvm/test/CodeGen/AArch64/llvm.frexp.ll
index 2213aa1429db..4e1876db772e 100644
--- a/llvm/test/CodeGen/AArch64/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.frexp.ll
@@ -700,13 +700,14 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; CHECK-NEXT:    ldr s1, [sp, #44]
 ; CHECK-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    ld1 { v1.s }[1], [x19]
 ; CHECK-NEXT:    mov v2.s[3], v0.s[0]
-; CHECK-NEXT:    ld1 { v1.s }[2], [x20]
+; CHECK-NEXT:    ld1 { v1.s }[1], [x19]
+; CHECK-NEXT:    ldr s0, [x20]
+; CHECK-NEXT:    ld1 { v0.s }[1], [x21]
 ; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    ld1 { v1.s }[3], [x21]
 ; CHECK-NEXT:    ldp x30, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    zip1 v1.2d, v1.2d, v0.2d
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    add sp, sp, #80
 ; CHECK-NEXT:    ret
 ;
@@ -872,10 +873,11 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; CHECK-NEXT:    bl frexpf
 ; CHECK-NEXT:    ldr s0, [sp, #28]
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x19]
-; CHECK-NEXT:    ld1 { v0.s }[2], [x20]
+; CHECK-NEXT:    ldr s1, [x20]
+; CHECK-NEXT:    ld1 { v1.s }[1], [x21]
 ; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ld1 { v0.s }[3], [x21]
 ; CHECK-NEXT:    ldp x30, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 4f0c4080aa0c..9443004ea434 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -6810,195 +6810,200 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> %
 ; CHECK-SD-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-SD-NEXT:    .cfi_offset w29, -16
-; CHECK-SD-NEXT:    ldr b5, [sp, #208]
+; CHECK-SD-NEXT:    ldr b0, [sp, #208]
 ; CHECK-SD-NEXT:    add x8, sp, #216
-; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    add x9, sp, #272
+; CHECK-SD-NEXT:    ldr b2, [sp, #80]
 ; CHECK-SD-NEXT:    ldr b4, [sp, #976]
-; CHECK-SD-NEXT:    add x9, sp, #984
-; CHECK-SD-NEXT:    add x12, sp, #328
-; CHECK-SD-NEXT:    ld1 { v5.b }[1], [x8]
-; CHECK-SD-NEXT:    add x8, sp, #224
-; CHECK-SD-NEXT:    movi v1.16b, #1
-; CHECK-SD-NEXT:    mov v0.b[1], w1
-; CHECK-SD-NEXT:    ld1 { v4.b }[1], [x9]
-; CHECK-SD-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-SD-NEXT:    add x11, sp, #992
 ; CHECK-SD-NEXT:    ldr b6, [sp, #720]
-; CHECK-SD-NEXT:    ldr b7, [sp, #80]
-; CHECK-SD-NEXT:    ld1 { v5.b }[2], [x8]
+; CHECK-SD-NEXT:    ld1 { v0.b }[1], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #224
+; CHECK-SD-NEXT:    fmov s16, w0
+; CHECK-SD-NEXT:    ldr b17, [sp, #848]
+; CHECK-SD-NEXT:    add x10, sp, #24
+; CHECK-SD-NEXT:    movi v19.2d, #0000000000000000
+; CHECK-SD-NEXT:    ld1 { v0.b }[2], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #232
-; CHECK-SD-NEXT:    add x13, sp, #88
-; CHECK-SD-NEXT:    ld1 { v4.b }[2], [x11]
-; CHECK-SD-NEXT:    ld1 { v7.b }[1], [x13]
-; CHECK-SD-NEXT:    add x13, sp, #856
-; CHECK-SD-NEXT:    mov v0.b[2], w2
-; CHECK-SD-NEXT:    add x14, sp, #1008
-; CHECK-SD-NEXT:    add x15, sp, #872
-; CHECK-SD-NEXT:    ld1 { v5.b }[3], [x8]
+; CHECK-SD-NEXT:    mov v16.b[1], w1
+; CHECK-SD-NEXT:    ld1 { v0.b }[3], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #240
-; CHECK-SD-NEXT:    add x16, sp, #888
-; CHECK-SD-NEXT:    add x10, sp, #16
-; CHECK-SD-NEXT:    add x9, sp, #24
-; CHECK-SD-NEXT:    add x11, sp, #40
-; CHECK-SD-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-SD-NEXT:    ld1 { v5.b }[4], [x8]
+; CHECK-SD-NEXT:    mov v16.b[2], w2
+; CHECK-SD-NEXT:    ld1 { v0.b }[4], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #248
-; CHECK-SD-NEXT:    mov v0.b[3], w3
-; CHECK-SD-NEXT:    ld1 { v5.b }[5], [x8]
+; CHECK-SD-NEXT:    mov v16.b[3], w3
+; CHECK-SD-NEXT:    ld1 { v0.b }[5], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #256
-; CHECK-SD-NEXT:    mov v0.b[4], w4
-; CHECK-SD-NEXT:    ld1 { v5.b }[6], [x8]
+; CHECK-SD-NEXT:    ld1 { v0.b }[6], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #264
-; CHECK-SD-NEXT:    mov v0.b[5], w5
-; CHECK-SD-NEXT:    ld1 { v5.b }[7], [x8]
-; CHECK-SD-NEXT:    add x8, sp, #272
-; CHECK-SD-NEXT:    ld1 { v5.b }[8], [x8]
+; CHECK-SD-NEXT:    mov v16.b[4], w4
+; CHECK-SD-NEXT:    ld1 { v0.b }[7], [x8]
+; CHECK-SD-NEXT:    ldr b1, [x9]
 ; CHECK-SD-NEXT:    add x8, sp, #280
-; CHECK-SD-NEXT:    mov v0.b[6], w6
-; CHECK-SD-NEXT:    ld1 { v5.b }[9], [x8]
+; CHECK-SD-NEXT:    add x9, sp, #88
+; CHECK-SD-NEXT:    mov v16.b[5], w5
+; CHECK-SD-NEXT:    ld1 { v1.b }[1], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #288
-; CHECK-SD-NEXT:    mov v0.b[7], w7
-; CHECK-SD-NEXT:    ld1 { v5.b }[10], [x8]
+; CHECK-SD-NEXT:    ld1 { v1.b }[2], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #296
-; CHECK-SD-NEXT:    ld1 { v0.b }[8], [x10]
-; CHECK-SD-NEXT:    add x10, sp, #128
-; CHECK-SD-NEXT:    ld1 { v5.b }[11], [x8]
+; CHECK-SD-NEXT:    mov v16.b[6], w6
+; CHECK-SD-NEXT:    ld1 { v1.b }[3], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #304
-; CHECK-SD-NEXT:    ld1 { v0.b }[9], [x9]
-; CHECK-SD-NEXT:    add x9, sp, #136
-; CHECK-SD-NEXT:    ld1 { v5.b }[12], [x8]
+; CHECK-SD-NEXT:    mov v16.b[7], w7
+; CHECK-SD-NEXT:    ld1 { v1.b }[4], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #312
-; CHECK-SD-NEXT:    ld1 { v5.b }[13], [x8]
+; CHECK-SD-NEXT:    ld1 { v1.b }[5], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #320
-; CHECK-SD-NEXT:    ld1 { v5.b }[14], [x8]
-; CHECK-SD-NEXT:    add x8, sp, #32
-; CHECK-SD-NEXT:    ld1 { v0.b }[10], [x8]
-; CHECK-SD-NEXT:    add x8, sp, #144
-; CHECK-SD-NEXT:    ld1 { v5.b }[15], [x12]
-; CHECK-SD-NEXT:    add x12, sp, #728
-; CHECK-SD-NEXT:    ld1 { v6.b }[1], [x12]
-; CHECK-SD-NEXT:    add x12, sp, #1000
-; CHECK-SD-NEXT:    ld1 { v0.b }[11], [x11]
-; CHECK-SD-NEXT:    ld1 { v4.b }[3], [x12]
-; CHECK-SD-NEXT:    add x12, sp, #736
-; CHECK-SD-NEXT:    add x11, sp, #920
-; CHECK-SD-NEXT:    sdot v3.4s, v5.16b, v1.16b
-; CHECK-SD-NEXT:    ldr b5, [sp, #848]
-; CHECK-SD-NEXT:    ld1 { v6.b }[2], [x12]
-; CHECK-SD-NEXT:    add x12, sp, #48
-; CHECK-SD-NEXT:    ld1 { v5.b }[1], [x13]
-; CHECK-SD-NEXT:    add x13, sp, #744
-; CHECK-SD-NEXT:    ld1 { v4.b }[4], [x14]
-; CHECK-SD-NEXT:    add x14, sp, #96
-; CHECK-SD-NEXT:    ld1 { v0.b }[12], [x12]
-; CHECK-SD-NEXT:    ld1 { v6.b }[3], [x13]
-; CHECK-SD-NEXT:    add x13, sp, #864
-; CHECK-SD-NEXT:    ld1 { v7.b }[2], [x14]
-; CHECK-SD-NEXT:    add x14, sp, #1016
-; CHECK-SD-NEXT:    ld1 { v5.b }[2], [x13]
-; CHECK-SD-NEXT:    add x13, sp, #752
-; CHECK-SD-NEXT:    ld1 { v4.b }[5], [x14]
-; CHECK-SD-NEXT:    add x14, sp, #104
-; CHECK-SD-NEXT:    ld1 { v6.b }[4], [x13]
-; CHECK-SD-NEXT:    add x13, sp, #1024
-; CHECK-SD-NEXT:    ld1 { v7.b }[3], [x14]
-; CHECK-SD-NEXT:    ld1 { v5.b }[3], [x15]
-; CHECK-SD-NEXT:    add x15, sp, #760
-; CHECK-SD-NEXT:    add x14, sp, #112
-; CHECK-SD-NEXT:    ld1 { v4.b }[6], [x13]
-; CHECK-SD-NEXT:    add x13, sp, #880
-; CHECK-SD-NEXT:    ld1 { v6.b }[5], [x15]
-; CHECK-SD-NEXT:    add x15, sp, #1032
-; CHECK-SD-NEXT:    ld1 { v7.b }[4], [x14]
-; CHECK-SD-NEXT:    ld1 { v5.b }[4], [x13]
-; CHECK-SD-NEXT:    add x14, sp, #768
-; CHECK-SD-NEXT:    add x13, sp, #120
-; CHECK-SD-NEXT:    ld1 { v4.b }[7], [x15]
-; CHECK-SD-NEXT:    add x15, sp, #1040
-; CHECK-SD-NEXT:    ld1 { v6.b }[6], [x14]
-; CHECK-SD-NEXT:    ld1 { v7.b }[5], [x13]
-; CHECK-SD-NEXT:    add x13, sp, #776
-; CHECK-SD-NEXT:    ld1 { v5.b }[5], [x16]
-; CHECK-SD-NEXT:    add x14, sp, #1048
-; CHECK-SD-NEXT:    ld1 { v4.b }[8], [x15]
-; CHECK-SD-NEXT:    add x15, sp, #896
-; CHECK-SD-NEXT:    ld1 { v6.b }[7], [x13]
-; CHECK-SD-NEXT:    ld1 { v7.b }[6], [x10]
-; CHECK-SD-NEXT:    add x10, sp, #784
-; CHECK-SD-NEXT:    ld1 { v5.b }[6], [x15]
-; CHECK-SD-NEXT:    add x13, sp, #1056
-; CHECK-SD-NEXT:    ld1 { v4.b }[9], [x14]
-; CHECK-SD-NEXT:    add x14, sp, #904
-; CHECK-SD-NEXT:    ld1 { v6.b }[8], [x10]
-; CHECK-SD-NEXT:    ld1 { v7.b }[7], [x9]
-; CHECK-SD-NEXT:    add x9, sp, #792
-; CHECK-SD-NEXT:    ld1 { v5.b }[7], [x14]
-; CHECK-SD-NEXT:    add x10, sp, #1064
-; CHECK-SD-NEXT:    ld1 { v4.b }[10], [x13]
-; CHECK-SD-NEXT:    add x13, sp, #912
-; CHECK-SD-NEXT:    ld1 { v6.b }[9], [x9]
-; CHECK-SD-NEXT:    ld1 { v7.b }[8], [x8]
-; CHECK-SD-NEXT:    add x9, sp, #800
-; CHECK-SD-NEXT:    ld1 { v5.b }[8], [x13]
+; CHECK-SD-NEXT:    ld1 { v1.b }[6], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #328
+; CHECK-SD-NEXT:    ld1 { v1.b }[7], [x8]
+; CHECK-SD-NEXT:    ld1 { v2.b }[1], [x9]
+; CHECK-SD-NEXT:    add x8, sp, #96
+; CHECK-SD-NEXT:    add x9, sp, #144
+; CHECK-SD-NEXT:    ld1 { v2.b }[2], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #104
+; CHECK-SD-NEXT:    zip1 v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    movi v1.16b, #1
+; CHECK-SD-NEXT:    ld1 { v2.b }[3], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #112
+; CHECK-SD-NEXT:    ld1 { v2.b }[4], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #120
+; CHECK-SD-NEXT:    ld1 { v2.b }[5], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #128
+; CHECK-SD-NEXT:    ld1 { v2.b }[6], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #136
+; CHECK-SD-NEXT:    ld1 { v2.b }[7], [x8]
+; CHECK-SD-NEXT:    ldr b3, [x9]
 ; CHECK-SD-NEXT:    add x8, sp, #152
-; CHECK-SD-NEXT:    ld1 { v4.b }[11], [x10]
-; CHECK-SD-NEXT:    add x10, sp, #1072
-; CHECK-SD-NEXT:    ld1 { v6.b }[10], [x9]
-; CHECK-SD-NEXT:    ld1 { v7.b }[9], [x8]
-; CHECK-SD-NEXT:    add x9, sp, #808
-; CHECK-SD-NEXT:    ld1 { v5.b }[9], [x11]
-; CHECK-SD-NEXT:    add x8, sp, #56
-; CHECK-SD-NEXT:    ld1 { v4.b }[12], [x10]
-; CHECK-SD-NEXT:    add x10, sp, #160
-; CHECK-SD-NEXT:    ld1 { v0.b }[13], [x8]
-; CHECK-SD-NEXT:    ld1 { v6.b }[11], [x9]
-; CHECK-SD-NEXT:    add x9, sp, #928
-; CHECK-SD-NEXT:    ld1 { v7.b }[10], [x10]
-; CHECK-SD-NEXT:    add x10, sp, #1080
-; CHECK-SD-NEXT:    ld1 { v5.b }[10], [x9]
+; CHECK-SD-NEXT:    add x9, sp, #984
+; CHECK-SD-NEXT:    ld1 { v3.b }[1], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #160
+; CHECK-SD-NEXT:    ld1 { v3.b }[2], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #168
+; CHECK-SD-NEXT:    ld1 { v3.b }[3], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #176
+; CHECK-SD-NEXT:    ld1 { v3.b }[4], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #184
+; CHECK-SD-NEXT:    ld1 { v3.b }[5], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #192
+; CHECK-SD-NEXT:    ld1 { v3.b }[6], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #200
+; CHECK-SD-NEXT:    ld1 { v3.b }[7], [x8]
+; CHECK-SD-NEXT:    ld1 { v4.b }[1], [x9]
+; CHECK-SD-NEXT:    add x8, sp, #992
+; CHECK-SD-NEXT:    add x9, sp, #1040
+; CHECK-SD-NEXT:    ld1 { v4.b }[2], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #1000
+; CHECK-SD-NEXT:    zip1 v2.2d, v2.2d, v3.2d
+; CHECK-SD-NEXT:    ld1 { v4.b }[3], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #1008
+; CHECK-SD-NEXT:    ld1 { v4.b }[4], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #1016
+; CHECK-SD-NEXT:    ld1 { v4.b }[5], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #1024
+; CHECK-SD-NEXT:    ld1 { v4.b }[6], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #1032
+; CHECK-SD-NEXT:    ld1 { v4.b }[7], [x8]
+; CHECK-SD-NEXT:    ldr b5, [x9]
+; CHECK-SD-NEXT:    add x8, sp, #1048
+; CHECK-SD-NEXT:    add x9, sp, #728
+; CHECK-SD-NEXT:    ld1 { v5.b }[1], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #1056
+; CHECK-SD-NEXT:    ld1 { v5.b }[2], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #1064
+; CHECK-SD-NEXT:    ld1 { v5.b }[3], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #1072
+; CHECK-SD-NEXT:    ld1 { v5.b }[4], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #1080
+; CHECK-SD-NEXT:    ld1 { v5.b }[5], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #1088
+; CHECK-SD-NEXT:    ld1 { v5.b }[6], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #1096
+; CHECK-SD-NEXT:    ld1 { v5.b }[7], [x8]
+; CHECK-SD-NEXT:    ld1 { v6.b }[1], [x9]
+; CHECK-SD-NEXT:    add x8, sp, #736
+; CHECK-SD-NEXT:    add x9, sp, #784
+; CHECK-SD-NEXT:    ld1 { v6.b }[2], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #744
+; CHECK-SD-NEXT:    zip1 v4.2d, v4.2d, v5.2d
+; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT:    ld1 { v6.b }[3], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #752
+; CHECK-SD-NEXT:    sdot v19.4s, v4.16b, v1.16b
+; CHECK-SD-NEXT:    sdot v5.4s, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ld1 { v6.b }[4], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #760
+; CHECK-SD-NEXT:    ld1 { v6.b }[5], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #768
+; CHECK-SD-NEXT:    ld1 { v6.b }[6], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #776
+; CHECK-SD-NEXT:    ld1 { v6.b }[7], [x8]
+; CHECK-SD-NEXT:    ldr b7, [x9]
+; CHECK-SD-NEXT:    add x8, sp, #792
+; CHECK-SD-NEXT:    add x9, sp, #856
+; CHECK-SD-NEXT:    ld1 { v7.b }[1], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #800
+; CHECK-SD-NEXT:    ld1 { v7.b }[2], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #808
+; CHECK-SD-NEXT:    ld1 { v7.b }[3], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #816
-; CHECK-SD-NEXT:    ld1 { v4.b }[13], [x10]
-; CHECK-SD-NEXT:    add x9, sp, #168
-; CHECK-SD-NEXT:    add x10, sp, #176
-; CHECK-SD-NEXT:    ld1 { v6.b }[12], [x8]
-; CHECK-SD-NEXT:    add x8, sp, #936
-; CHECK-SD-NEXT:    ld1 { v7.b }[11], [x9]
-; CHECK-SD-NEXT:    add x9, sp, #1088
-; CHECK-SD-NEXT:    ld1 { v5.b }[11], [x8]
-; CHECK-SD-NEXT:    add x8, sp, #64
-; CHECK-SD-NEXT:    ld1 { v4.b }[14], [x9]
-; CHECK-SD-NEXT:    add x9, sp, #824
-; CHECK-SD-NEXT:    ld1 { v0.b }[14], [x8]
-; CHECK-SD-NEXT:    ld1 { v6.b }[13], [x9]
-; CHECK-SD-NEXT:    add x9, sp, #944
-; CHECK-SD-NEXT:    ld1 { v7.b }[12], [x10]
-; CHECK-SD-NEXT:    add x10, sp, #1096
-; CHECK-SD-NEXT:    ld1 { v5.b }[12], [x9]
+; CHECK-SD-NEXT:    ld1 { v7.b }[4], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #824
+; CHECK-SD-NEXT:    ld1 { v7.b }[5], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #832
-; CHECK-SD-NEXT:    ld1 { v4.b }[15], [x10]
-; CHECK-SD-NEXT:    add x9, sp, #184
-; CHECK-SD-NEXT:    add x10, sp, #72
-; CHECK-SD-NEXT:    ld1 { v6.b }[14], [x8]
-; CHECK-SD-NEXT:    add x8, sp, #952
-; CHECK-SD-NEXT:    ld1 { v7.b }[13], [x9]
-; CHECK-SD-NEXT:    ld1 { v5.b }[13], [x8]
+; CHECK-SD-NEXT:    ld1 { v7.b }[6], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #840
-; CHECK-SD-NEXT:    ld1 { v0.b }[15], [x10]
-; CHECK-SD-NEXT:    sdot v2.4s, v4.16b, v1.16b
-; CHECK-SD-NEXT:    add x9, sp, #192
-; CHECK-SD-NEXT:    ld1 { v6.b }[15], [x8]
+; CHECK-SD-NEXT:    ld1 { v7.b }[7], [x8]
+; CHECK-SD-NEXT:    ld1 { v17.b }[1], [x9]
+; CHECK-SD-NEXT:    add x8, sp, #864
+; CHECK-SD-NEXT:    add x9, sp, #16
+; CHECK-SD-NEXT:    ld1 { v16.b }[8], [x9]
+; CHECK-SD-NEXT:    add x9, sp, #912
+; CHECK-SD-NEXT:    ld1 { v17.b }[2], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #872
+; CHECK-SD-NEXT:    zip1 v0.2d, v6.2d, v7.2d
+; CHECK-SD-NEXT:    ld1 { v16.b }[9], [x10]
+; CHECK-SD-NEXT:    ld1 { v17.b }[3], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #880
+; CHECK-SD-NEXT:    sdot v19.4s, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ld1 { v17.b }[4], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #888
+; CHECK-SD-NEXT:    ld1 { v17.b }[5], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #896
+; CHECK-SD-NEXT:    ld1 { v17.b }[6], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #904
+; CHECK-SD-NEXT:    ld1 { v17.b }[7], [x8]
+; CHECK-SD-NEXT:    ldr b18, [x9]
+; CHECK-SD-NEXT:    add x8, sp, #920
+; CHECK-SD-NEXT:    ld1 { v18.b }[1], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #32
+; CHECK-SD-NEXT:    ld1 { v16.b }[10], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #928
+; CHECK-SD-NEXT:    ld1 { v18.b }[2], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #40
+; CHECK-SD-NEXT:    ld1 { v16.b }[11], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #936
+; CHECK-SD-NEXT:    ld1 { v18.b }[3], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #48
+; CHECK-SD-NEXT:    ld1 { v16.b }[12], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #944
+; CHECK-SD-NEXT:    ld1 { v18.b }[4], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #56
+; CHECK-SD-NEXT:    ld1 { v16.b }[13], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #952
+; CHECK-SD-NEXT:    ld1 { v18.b }[5], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #64
+; CHECK-SD-NEXT:    ld1 { v16.b }[14], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #960
-; CHECK-SD-NEXT:    ld1 { v7.b }[14], [x9]
-; CHECK-SD-NEXT:    ld1 { v5.b }[14], [x8]
-; CHECK-SD-NEXT:    sdot v3.4s, v0.16b, v1.16b
-; CHECK-SD-NEXT:    add x8, sp, #200
-; CHECK-SD-NEXT:    add x9, sp, #968
-; CHECK-SD-NEXT:    sdot v2.4s, v6.16b, v1.16b
-; CHECK-SD-NEXT:    ld1 { v7.b }[15], [x8]
-; CHECK-SD-NEXT:    ld1 { v5.b }[15], [x9]
-; CHECK-SD-NEXT:    sdot v3.4s, v7.16b, v1.16b
-; CHECK-SD-NEXT:    sdot v2.4s, v5.16b, v1.16b
-; CHECK-SD-NEXT:    add v0.4s, v3.4s, v2.4s
+; CHECK-SD-NEXT:    ld1 { v18.b }[6], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #72
+; CHECK-SD-NEXT:    ld1 { v16.b }[15], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #968
+; CHECK-SD-NEXT:    ld1 { v18.b }[7], [x8]
+; CHECK-SD-NEXT:    sdot v5.4s, v16.16b, v1.16b
+; CHECK-SD-NEXT:    zip1 v0.2d, v17.2d, v18.2d
+; CHECK-SD-NEXT:    sdot v5.4s, v2.16b, v1.16b
+; CHECK-SD-NEXT:    sdot v19.4s, v0.16b, v1.16b
+; CHECK-SD-NEXT:    add v0.4s, v5.4s, v19.4s
 ; CHECK-SD-NEXT:    addv s0, v0.4s
 ; CHECK-SD-NEXT:    fmov w0, s0
 ; CHECK-SD-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/nontemporal.ll b/llvm/test/CodeGen/AArch64/nontemporal.ll
index f8ba150a0405..f7a87ae340a7 100644
--- a/llvm/test/CodeGen/AArch64/nontemporal.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal.ll
@@ -683,41 +683,43 @@ define void @test_stnp_v17f32(<17 x float> %v, ptr %ptr) {
 ;
 ; CHECK-BE-LABEL: test_stnp_v17f32:
 ; CHECK-BE:       // %bb.0: // %entry
-; CHECK-BE-NEXT:    // kill: def $s4 killed $s4 def $q4
+; CHECK-BE-NEXT:    // kill: def $s1 killed $s1 def $q1
 ; CHECK-BE-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-BE-NEXT:    ldr s16, [sp, #36]
+; CHECK-BE-NEXT:    // kill: def $s4 killed $s4 def $q4
 ; CHECK-BE-NEXT:    // kill: def $s5 killed $s5 def $q5
-; CHECK-BE-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-BE-NEXT:    ldr s17, [sp, #4]
-; CHECK-BE-NEXT:    add x8, sp, #44
-; CHECK-BE-NEXT:    mov v4.s[1], v5.s[0]
+; CHECK-BE-NEXT:    add x8, sp, #12
+; CHECK-BE-NEXT:    add x9, sp, #20
+; CHECK-BE-NEXT:    ldr s16, [sp, #36]
 ; CHECK-BE-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-BE-NEXT:    ldr s1, [sp, #4]
+; CHECK-BE-NEXT:    mov v4.s[1], v5.s[0]
+; CHECK-BE-NEXT:    add x10, sp, #52
 ; CHECK-BE-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-BE-NEXT:    // kill: def $s2 killed $s2 def $q2
 ; CHECK-BE-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-BE-NEXT:    // kill: def $s3 killed $s3 def $q3
-; CHECK-BE-NEXT:    ldr s1, [sp, #68]
-; CHECK-BE-NEXT:    ld1 { v16.s }[1], [x8]
-; CHECK-BE-NEXT:    add x8, sp, #12
-; CHECK-BE-NEXT:    ld1 { v17.s }[1], [x8]
-; CHECK-BE-NEXT:    add x8, sp, #52
-; CHECK-BE-NEXT:    str s1, [x0, #64]
-; CHECK-BE-NEXT:    ld1 { v16.s }[2], [x8]
-; CHECK-BE-NEXT:    add x8, sp, #20
+; CHECK-BE-NEXT:    ld1 { v1.s }[1], [x8]
+; CHECK-BE-NEXT:    ldr s5, [x9]
+; CHECK-BE-NEXT:    add x8, sp, #28
+; CHECK-BE-NEXT:    add x9, sp, #44
+; CHECK-BE-NEXT:    ld1 { v5.s }[1], [x8]
+; CHECK-BE-NEXT:    ld1 { v16.s }[1], [x9]
+; CHECK-BE-NEXT:    ldr s17, [x10]
+; CHECK-BE-NEXT:    add x8, sp, #60
 ; CHECK-BE-NEXT:    mov v4.s[2], v6.s[0]
 ; CHECK-BE-NEXT:    mov v0.s[2], v2.s[0]
-; CHECK-BE-NEXT:    ld1 { v17.s }[2], [x8]
-; CHECK-BE-NEXT:    add x8, sp, #60
-; CHECK-BE-NEXT:    ld1 { v16.s }[3], [x8]
-; CHECK-BE-NEXT:    add x8, sp, #28
-; CHECK-BE-NEXT:    ld1 { v17.s }[3], [x8]
+; CHECK-BE-NEXT:    ld1 { v17.s }[1], [x8]
+; CHECK-BE-NEXT:    ldr s2, [sp, #68]
+; CHECK-BE-NEXT:    add x8, x0, #32
+; CHECK-BE-NEXT:    zip1 v1.2d, v1.2d, v5.2d
+; CHECK-BE-NEXT:    add x9, x0, #48
+; CHECK-BE-NEXT:    str s2, [x0, #64]
+; CHECK-BE-NEXT:    zip1 v5.2d, v16.2d, v17.2d
 ; CHECK-BE-NEXT:    mov v4.s[3], v7.s[0]
-; CHECK-BE-NEXT:    add x8, x0, #48
 ; CHECK-BE-NEXT:    mov v0.s[3], v3.s[0]
-; CHECK-BE-NEXT:    st1 { v16.4s }, [x8]
-; CHECK-BE-NEXT:    add x8, x0, #32
-; CHECK-BE-NEXT:    st1 { v17.4s }, [x8]
+; CHECK-BE-NEXT:    st1 { v1.4s }, [x8]
 ; CHECK-BE-NEXT:    add x8, x0, #16
+; CHECK-BE-NEXT:    st1 { v5.4s }, [x9]
 ; CHECK-BE-NEXT:    st1 { v4.4s }, [x8]
 ; CHECK-BE-NEXT:    st1 { v0.4s }, [x0]
 ; CHECK-BE-NEXT:    ret
-- 
cgit v1.2.3


From 661cbd5a5254de22ba87a49e89f54b30e2874fb3 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Thu, 17 Jul 2025 11:27:47 -0500
Subject: [utils][TableGen] Make some non-bitmask enums iterable (#148647)

Additionally, add sentinel values <Enum>::First_ and <Enum>::Last_ to
each one of those enums.

This will allow using `enum_seq_inclusive` to generate the list of
enum-typed values of any generated scoped (non-bitmask) enum.
---
 llvm/test/TableGen/directive1.td                   | 25 ++++++++++++++++++++++
 llvm/test/TableGen/directive2.td                   | 25 ++++++++++++++++++++++
 .../Frontend/OpenMPDirectiveNameParserTest.cpp     | 21 ++++++------------
 llvm/utils/TableGen/Basic/DirectiveEmitter.cpp     | 22 +++++++++++++++++--
 4 files changed, 76 insertions(+), 17 deletions(-)

diff --git a/llvm/test/TableGen/directive1.td b/llvm/test/TableGen/directive1.td
index 1d2bd51204e4..3eda077eeabf 100644
--- a/llvm/test/TableGen/directive1.td
+++ b/llvm/test/TableGen/directive1.td
@@ -53,6 +53,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-EMPTY:
 // CHECK-NEXT:  #include "llvm/ADT/ArrayRef.h"
 // CHECK-NEXT:  #include "llvm/ADT/BitmaskEnum.h"
+// CHECK-NEXT:  #include "llvm/ADT/Sequence.h"
 // CHECK-NEXT:  #include "llvm/ADT/StringRef.h"
 // CHECK-NEXT:  #include "llvm/Frontend/Directive/Spelling.h"
 // CHECK-NEXT:  #include "llvm/Support/Compiler.h"
@@ -66,22 +67,26 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-EMPTY:
 // CHECK-NEXT:  enum class Association {
 // CHECK-NEXT:    Block,
+// CHECK-NEXT:    First_ = Block,
 // CHECK-NEXT:    Declaration,
 // CHECK-NEXT:    Delimited,
 // CHECK-NEXT:    Loop,
 // CHECK-NEXT:    None,
 // CHECK-NEXT:    Separating,
+// CHECK-NEXT:    Last_ = Separating,
 // CHECK-NEXT:  };
 // CHECK-EMPTY:
 // CHECK-NEXT:  static constexpr std::size_t Association_enumSize = 6;
 // CHECK-EMPTY:
 // CHECK-NEXT:  enum class Category {
 // CHECK-NEXT:    Declarative,
+// CHECK-NEXT:    First_ = Declarative,
 // CHECK-NEXT:    Executable,
 // CHECK-NEXT:    Informational,
 // CHECK-NEXT:    Meta,
 // CHECK-NEXT:    Subsidiary,
 // CHECK-NEXT:    Utility,
+// CHECK-NEXT:    Last_ = Utility,
 // CHECK-NEXT:  };
 // CHECK-EMPTY:
 // CHECK-NEXT:  static constexpr std::size_t Category_enumSize = 6;
@@ -96,6 +101,8 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-EMPTY:
 // CHECK-NEXT:  enum class Directive {
 // CHECK-NEXT:    TDLD_dira,
+// CHECK-NEXT:    First_ = TDLD_dira,
+// CHECK-NEXT:    Last_ = TDLD_dira,
 // CHECK-NEXT:  };
 // CHECK-EMPTY:
 // CHECK-NEXT:  static constexpr std::size_t Directive_enumSize = 1;
@@ -104,8 +111,10 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-EMPTY:
 // CHECK-NEXT:  enum class Clause {
 // CHECK-NEXT:    TDLC_clausea,
+// CHECK-NEXT:    First_ = TDLC_clausea,
 // CHECK-NEXT:    TDLC_clauseb,
 // CHECK-NEXT:    TDLC_clausec,
+// CHECK-NEXT:    Last_ = TDLC_clausec,
 // CHECK-NEXT:  };
 // CHECK-EMPTY:
 // CHECK-NEXT:  static constexpr std::size_t Clause_enumSize = 3;
@@ -151,6 +160,22 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-NEXT:  LLVM_ABI StringRef getTdlAKindName(AKind x);
 // CHECK-EMPTY:
 // CHECK-NEXT:  } // namespace tdl
+// CHECK-EMPTY:
+// CHECK-NEXT:  template <> struct enum_iteration_traits<tdl::Association> {
+// CHECK-NEXT:    static constexpr bool is_iterable = true;
+// CHECK-NEXT:  };
+// CHECK-EMPTY:
+// CHECK-NEXT:  template <> struct enum_iteration_traits<tdl::Category> {
+// CHECK-NEXT:    static constexpr bool is_iterable = true;
+// CHECK-NEXT:  };
+// CHECK-EMPTY:
+// CHECK-NEXT:  template <> struct enum_iteration_traits<tdl::Directive> {
+// CHECK-NEXT:    static constexpr bool is_iterable = true;
+// CHECK-NEXT:  };
+// CHECK-EMPTY:
+// CHECK-NEXT:  template <> struct enum_iteration_traits<tdl::Clause> {
+// CHECK-NEXT:    static constexpr bool is_iterable = true;
+// CHECK-NEXT:  };
 // CHECK-NEXT:  } // namespace llvm
 // CHECK-NEXT:  #endif // LLVM_Tdl_INC
 
diff --git a/llvm/test/TableGen/directive2.td b/llvm/test/TableGen/directive2.td
index 3a64bb3900a3..a25197c3efd9 100644
--- a/llvm/test/TableGen/directive2.td
+++ b/llvm/test/TableGen/directive2.td
@@ -46,6 +46,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-NEXT:  #define LLVM_Tdl_INC
 // CHECK-EMPTY:
 // CHECK-NEXT:  #include "llvm/ADT/ArrayRef.h"
+// CHECK-NEXT:  #include "llvm/ADT/Sequence.h"
 // CHECK-NEXT:  #include "llvm/ADT/StringRef.h"
 // CHECK-NEXT:  #include "llvm/Frontend/Directive/Spelling.h"
 // CHECK-NEXT:  #include "llvm/Support/Compiler.h"
@@ -57,22 +58,26 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-EMPTY:
 // CHECK-NEXT:  enum class Association {
 // CHECK-NEXT:    Block,
+// CHECK-NEXT:    First_ = Block,
 // CHECK-NEXT:    Declaration,
 // CHECK-NEXT:    Delimited,
 // CHECK-NEXT:    Loop,
 // CHECK-NEXT:    None,
 // CHECK-NEXT:    Separating,
+// CHECK-NEXT:    Last_ = Separating,
 // CHECK-NEXT:  };
 // CHECK-EMPTY:
 // CHECK-NEXT:  static constexpr std::size_t Association_enumSize = 6;
 // CHECK-EMPTY:
 // CHECK-NEXT:  enum class Category {
 // CHECK-NEXT:    Declarative,
+// CHECK-NEXT:    First_ = Declarative,
 // CHECK-NEXT:    Executable,
 // CHECK-NEXT:    Informational,
 // CHECK-NEXT:    Meta,
 // CHECK-NEXT:    Subsidiary,
 // CHECK-NEXT:    Utility,
+// CHECK-NEXT:    Last_ = Utility,
 // CHECK-NEXT:  };
 // CHECK-EMPTY:
 // CHECK-NEXT:  static constexpr std::size_t Category_enumSize = 6;
@@ -87,15 +92,19 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-EMPTY:
 // CHECK-NEXT:  enum class Directive {
 // CHECK-NEXT:    TDLD_dira,
+// CHECK-NEXT:    First_ = TDLD_dira,
+// CHECK-NEXT:    Last_ = TDLD_dira,
 // CHECK-NEXT:  };
 // CHECK-EMPTY:
 // CHECK-NEXT:  static constexpr std::size_t Directive_enumSize = 1;
 // CHECK-EMPTY:
 // CHECK-NEXT:  enum class Clause {
 // CHECK-NEXT:    TDLC_clausea,
+// CHECK-NEXT:    First_ = TDLC_clausea,
 // CHECK-NEXT:    TDLC_clauseb,
 // CHECK-NEXT:    TDLC_clausec,
 // CHECK-NEXT:    TDLC_claused,
+// CHECK-NEXT:    Last_ = TDLC_claused,
 // CHECK-NEXT:  };
 // CHECK-EMPTY:
 // CHECK-NEXT:  static constexpr std::size_t Clause_enumSize = 4;
@@ -124,6 +133,22 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // CHECK-NEXT:  LLVM_ABI Category getDirectiveCategory(Directive D);
 // CHECK-NEXT:  LLVM_ABI SourceLanguage getDirectiveLanguages(Directive D);
 // CHECK-NEXT:  } // namespace tdl
+// CHECK-EMPTY:
+// CHECK-NEXT:  template <> struct enum_iteration_traits<tdl::Association> {
+// CHECK-NEXT:    static constexpr bool is_iterable = true;
+// CHECK-NEXT:  };
+// CHECK-EMPTY:
+// CHECK-NEXT:  template <> struct enum_iteration_traits<tdl::Category> {
+// CHECK-NEXT:    static constexpr bool is_iterable = true;
+// CHECK-NEXT:  };
+// CHECK-EMPTY:
+// CHECK-NEXT:  template <> struct enum_iteration_traits<tdl::Directive> {
+// CHECK-NEXT:    static constexpr bool is_iterable = true;
+// CHECK-NEXT:  };
+// CHECK-EMPTY:
+// CHECK-NEXT:  template <> struct enum_iteration_traits<tdl::Clause> {
+// CHECK-NEXT:    static constexpr bool is_iterable = true;
+// CHECK-NEXT:  };
 // CHECK-NEXT:  } // namespace llvm
 // CHECK-NEXT:  #endif // LLVM_Tdl_INC
 
diff --git a/llvm/unittests/Frontend/OpenMPDirectiveNameParserTest.cpp b/llvm/unittests/Frontend/OpenMPDirectiveNameParserTest.cpp
index 0363a08cc0f0..10329820bef7 100644
--- a/llvm/unittests/Frontend/OpenMPDirectiveNameParserTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPDirectiveNameParserTest.cpp
@@ -48,12 +48,6 @@ static std::string &prepareParamName(std::string &Name) {
   return Name;
 }
 
-namespace llvm {
-template <> struct enum_iteration_traits<omp::Directive> {
-  static constexpr bool is_iterable = true;
-};
-} // namespace llvm
-
 // Test tokenizing.
 
 class Tokenize : public testing::TestWithParam<omp::Directive> {};
@@ -87,12 +81,10 @@ getParamName1(const testing::TestParamInfo<Tokenize::ParamType> &Info) {
   return prepareParamName(Name);
 }
 
-INSTANTIATE_TEST_SUITE_P(
-    DirectiveNameParserTest, Tokenize,
-    testing::ValuesIn(
-        llvm::enum_seq(static_cast<omp::Directive>(0),
-                       static_cast<omp::Directive>(omp::Directive_enumSize))),
-    getParamName1);
+INSTANTIATE_TEST_SUITE_P(DirectiveNameParserTest, Tokenize,
+                         testing::ValuesIn(llvm::enum_seq_inclusive(
+                             omp::Directive::First_, omp::Directive::Last_)),
+                         getParamName1);
 
 // Test parsing of valid names.
 
@@ -131,9 +123,8 @@ getParamName2(const testing::TestParamInfo<ParseValid::ParamType> &Info) {
 
 INSTANTIATE_TEST_SUITE_P(
     DirectiveNameParserTest, ParseValid,
-    testing::Combine(testing::ValuesIn(llvm::enum_seq(
-                         static_cast<omp::Directive>(0),
-                         static_cast<omp::Directive>(omp::Directive_enumSize))),
+    testing::Combine(testing::ValuesIn(llvm::enum_seq_inclusive(
+                         omp::Directive::First_, omp::Directive::Last_)),
                      testing::ValuesIn(omp::getOpenMPVersions())),
     getParamName2);
 
diff --git a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp
index 177eecebce9a..f0e23690367d 100644
--- a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp
@@ -106,8 +106,16 @@ static void generateEnumClass(ArrayRef<const Record *> Records, raw_ostream &OS,
                               bool ExportEnums) {
   OS << "\n";
   OS << "enum class " << Enum << " {\n";
-  for (const Record *R : Records) {
-    OS << "  " << getIdentifierName(R, Prefix) << ",\n";
+  if (!Records.empty()) {
+    std::string N;
+    for (auto [I, R] : llvm::enumerate(Records)) {
+      N = getIdentifierName(R, Prefix);
+      OS << "  " << N << ",\n";
+      // Make the sentinel names less likely to conflict with actual names...
+      if (I == 0)
+        OS << "  First_ = " << N << ",\n";
+    }
+    OS << "  Last_ = " << N << ",\n";
   }
   OS << "};\n";
   OS << "\n";
@@ -282,6 +290,7 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) {
   if (DirLang.hasEnableBitmaskEnumInNamespace())
     OS << "#include \"llvm/ADT/BitmaskEnum.h\"\n";
 
+  OS << "#include \"llvm/ADT/Sequence.h\"\n";
   OS << "#include \"llvm/ADT/StringRef.h\"\n";
   OS << "#include \"llvm/Frontend/Directive/Spelling.h\"\n";
   OS << "#include \"llvm/Support/Compiler.h\"\n";
@@ -375,6 +384,15 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) {
   for (auto Ns : reverse(Namespaces))
     OS << "} // namespace " << Ns << "\n";
 
+  // These specializations need to be in ::llvm.
+  for (StringRef Enum : {"Association", "Category", "Directive", "Clause"}) {
+    OS << "\n";
+    OS << "template <> struct enum_iteration_traits<"
+       << DirLang.getCppNamespace() << "::" << Enum << "> {\n";
+    OS << "  static constexpr bool is_iterable = true;\n";
+    OS << "};\n";
+  }
+
   OS << "} // namespace llvm\n";
 
   OS << "#endif // LLVM_" << Lang << "_INC\n";
-- 
cgit v1.2.3


From 0dae924c1f668f74370b642ba91f818b728aca40 Mon Sep 17 00:00:00 2001
From: delaram-talaashrafi <dtalaashrafi@nvidia.com>
Date: Thu, 17 Jul 2025 12:38:02 -0400
Subject: [openacc][flang] Support two types of bindName representation in acc
 routine (#149147)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Based on the OpenACC specification — which states that if the bind name
is given as an identifier it should be resolved according to the
compiled language, and if given as a string it should be used unmodified
— we introduce two distinct `bindName` representations for `acc routine`
to handle each case appropriately: one as an array of `SymbolRefAttr`
for identifiers and another as an array of `StringAttr` for strings.

To ensure correct correspondence between bind names and devices, this
patch also introduces two separate sets of device attributes. The
routine operation is extended accordingly, along with the necessary
updates to the OpenACC dialect and its lowering.
---
 flang/lib/Lower/OpenACC.cpp                       | 102 ++++++++++++++------
 flang/test/Lower/OpenACC/acc-routine.f90          |   7 +-
 flang/test/Lower/OpenACC/acc-routine03.f90        |   2 +-
 mlir/include/mlir/Dialect/OpenACC/OpenACC.h       |   1 +
 mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td   |  12 ++-
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp           | 112 +++++++++++++++++-----
 mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp |  44 +++++++--
 7 files changed, 208 insertions(+), 72 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 25682cba5620..51eb33dec186 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4414,10 +4414,34 @@ getAttributeValueByDeviceType(llvm::SmallVector<mlir::Attribute> &attributes,
   return std::nullopt;
 }
 
+// Helper function to extract string value from bind name variant
+static std::optional<llvm::StringRef> getBindNameStringValue(
+    const std::optional<std::variant<mlir::SymbolRefAttr, mlir::StringAttr>>
+        &bindNameValue) {
+  if (!bindNameValue.has_value())
+    return std::nullopt;
+
+  return std::visit(
+      [](const auto &attr) -> std::optional<llvm::StringRef> {
+        if constexpr (std::is_same_v<std::decay_t<decltype(attr)>,
+                                     mlir::StringAttr>) {
+          return attr.getValue();
+        } else if constexpr (std::is_same_v<std::decay_t<decltype(attr)>,
+                                            mlir::SymbolRefAttr>) {
+          return attr.getLeafReference();
+        } else {
+          return std::nullopt;
+        }
+      },
+      bindNameValue.value());
+}
+
 static bool compareDeviceTypeInfo(
     mlir::acc::RoutineOp op,
-    llvm::SmallVector<mlir::Attribute> &bindNameArrayAttr,
-    llvm::SmallVector<mlir::Attribute> &bindNameDeviceTypeArrayAttr,
+    llvm::SmallVector<mlir::Attribute> &bindIdNameArrayAttr,
+    llvm::SmallVector<mlir::Attribute> &bindStrNameArrayAttr,
+    llvm::SmallVector<mlir::Attribute> &bindIdNameDeviceTypeArrayAttr,
+    llvm::SmallVector<mlir::Attribute> &bindStrNameDeviceTypeArrayAttr,
     llvm::SmallVector<mlir::Attribute> &gangArrayAttr,
     llvm::SmallVector<mlir::Attribute> &gangDimArrayAttr,
     llvm::SmallVector<mlir::Attribute> &gangDimDeviceTypeArrayAttr,
@@ -4427,9 +4451,13 @@ static bool compareDeviceTypeInfo(
   for (uint32_t dtypeInt = 0;
        dtypeInt != mlir::acc::getMaxEnumValForDeviceType(); ++dtypeInt) {
     auto dtype = static_cast<mlir::acc::DeviceType>(dtypeInt);
-    if (op.getBindNameValue(dtype) !=
-        getAttributeValueByDeviceType<llvm::StringRef, mlir::StringAttr>(
-            bindNameArrayAttr, bindNameDeviceTypeArrayAttr, dtype))
+    auto bindNameValue = getBindNameStringValue(op.getBindNameValue(dtype));
+    if (bindNameValue !=
+            getAttributeValueByDeviceType<llvm::StringRef, mlir::StringAttr>(
+                bindIdNameArrayAttr, bindIdNameDeviceTypeArrayAttr, dtype) &&
+        bindNameValue !=
+            getAttributeValueByDeviceType<llvm::StringRef, mlir::StringAttr>(
+                bindStrNameArrayAttr, bindStrNameDeviceTypeArrayAttr, dtype))
       return false;
     if (op.hasGang(dtype) != hasDeviceType(gangArrayAttr, dtype))
       return false;
@@ -4476,8 +4504,10 @@ getArrayAttrOrNull(fir::FirOpBuilder &builder,
 void createOpenACCRoutineConstruct(
     Fortran::lower::AbstractConverter &converter, mlir::Location loc,
     mlir::ModuleOp mod, mlir::func::FuncOp funcOp, std::string funcName,
-    bool hasNohost, llvm::SmallVector<mlir::Attribute> &bindNames,
-    llvm::SmallVector<mlir::Attribute> &bindNameDeviceTypes,
+    bool hasNohost, llvm::SmallVector<mlir::Attribute> &bindIdNames,
+    llvm::SmallVector<mlir::Attribute> &bindStrNames,
+    llvm::SmallVector<mlir::Attribute> &bindIdNameDeviceTypes,
+    llvm::SmallVector<mlir::Attribute> &bindStrNameDeviceTypes,
     llvm::SmallVector<mlir::Attribute> &gangDeviceTypes,
     llvm::SmallVector<mlir::Attribute> &gangDimValues,
     llvm::SmallVector<mlir::Attribute> &gangDimDeviceTypes,
@@ -4490,7 +4520,8 @@ void createOpenACCRoutineConstruct(
         0) {
       // If the routine is already specified with the same clauses, just skip
       // the operation creation.
-      if (compareDeviceTypeInfo(routineOp, bindNames, bindNameDeviceTypes,
+      if (compareDeviceTypeInfo(routineOp, bindIdNames, bindStrNames,
+                                bindIdNameDeviceTypes, bindStrNameDeviceTypes,
                                 gangDeviceTypes, gangDimValues,
                                 gangDimDeviceTypes, seqDeviceTypes,
                                 workerDeviceTypes, vectorDeviceTypes) &&
@@ -4507,8 +4538,10 @@ void createOpenACCRoutineConstruct(
   modBuilder.create<mlir::acc::RoutineOp>(
       loc, routineOpStr,
       mlir::SymbolRefAttr::get(builder.getContext(), funcName),
-      getArrayAttrOrNull(builder, bindNames),
-      getArrayAttrOrNull(builder, bindNameDeviceTypes),
+      getArrayAttrOrNull(builder, bindIdNames),
+      getArrayAttrOrNull(builder, bindStrNames),
+      getArrayAttrOrNull(builder, bindIdNameDeviceTypes),
+      getArrayAttrOrNull(builder, bindStrNameDeviceTypes),
       getArrayAttrOrNull(builder, workerDeviceTypes),
       getArrayAttrOrNull(builder, vectorDeviceTypes),
       getArrayAttrOrNull(builder, seqDeviceTypes), hasNohost,
@@ -4525,8 +4558,10 @@ static void interpretRoutineDeviceInfo(
     llvm::SmallVector<mlir::Attribute> &seqDeviceTypes,
     llvm::SmallVector<mlir::Attribute> &vectorDeviceTypes,
     llvm::SmallVector<mlir::Attribute> &workerDeviceTypes,
-    llvm::SmallVector<mlir::Attribute> &bindNameDeviceTypes,
-    llvm::SmallVector<mlir::Attribute> &bindNames,
+    llvm::SmallVector<mlir::Attribute> &bindIdNameDeviceTypes,
+    llvm::SmallVector<mlir::Attribute> &bindStrNameDeviceTypes,
+    llvm::SmallVector<mlir::Attribute> &bindIdNames,
+    llvm::SmallVector<mlir::Attribute> &bindStrNames,
     llvm::SmallVector<mlir::Attribute> &gangDeviceTypes,
     llvm::SmallVector<mlir::Attribute> &gangDimValues,
     llvm::SmallVector<mlir::Attribute> &gangDimDeviceTypes) {
@@ -4559,16 +4594,18 @@ static void interpretRoutineDeviceInfo(
   if (dinfo.bindNameOpt().has_value()) {
     const auto &bindName = dinfo.bindNameOpt().value();
     mlir::Attribute bindNameAttr;
-    if (const auto &bindStr{std::get_if<std::string>(&bindName)}) {
+    if (const auto &bindSym{
+            std::get_if<Fortran::semantics::SymbolRef>(&bindName)}) {
+      bindNameAttr = builder.getSymbolRefAttr(converter.mangleName(*bindSym));
+      bindIdNames.push_back(bindNameAttr);
+      bindIdNameDeviceTypes.push_back(getDeviceTypeAttr());
+    } else if (const auto &bindStr{std::get_if<std::string>(&bindName)}) {
       bindNameAttr = builder.getStringAttr(*bindStr);
-    } else if (const auto &bindSym{
-                   std::get_if<Fortran::semantics::SymbolRef>(&bindName)}) {
-      bindNameAttr = builder.getStringAttr(converter.mangleName(*bindSym));
+      bindStrNames.push_back(bindNameAttr);
+      bindStrNameDeviceTypes.push_back(getDeviceTypeAttr());
     } else {
       llvm_unreachable("Unsupported bind name type");
     }
-    bindNames.push_back(bindNameAttr);
-    bindNameDeviceTypes.push_back(getDeviceTypeAttr());
   }
 }
 
@@ -4584,8 +4621,9 @@ void Fortran::lower::genOpenACCRoutineConstruct(
   bool hasNohost{false};
 
   llvm::SmallVector<mlir::Attribute> seqDeviceTypes, vectorDeviceTypes,
-      workerDeviceTypes, bindNameDeviceTypes, bindNames, gangDeviceTypes,
-      gangDimDeviceTypes, gangDimValues;
+      workerDeviceTypes, bindIdNameDeviceTypes, bindStrNameDeviceTypes,
+      bindIdNames, bindStrNames, gangDeviceTypes, gangDimDeviceTypes,
+      gangDimValues;
 
   for (const Fortran::semantics::OpenACCRoutineInfo &info : routineInfos) {
     // Device Independent Attributes
@@ -4594,24 +4632,26 @@ void Fortran::lower::genOpenACCRoutineConstruct(
     }
     // Note: Device Independent Attributes are set to the
     // none device type in `info`.
-    interpretRoutineDeviceInfo(converter, info, seqDeviceTypes,
-                               vectorDeviceTypes, workerDeviceTypes,
-                               bindNameDeviceTypes, bindNames, gangDeviceTypes,
-                               gangDimValues, gangDimDeviceTypes);
+    interpretRoutineDeviceInfo(
+        converter, info, seqDeviceTypes, vectorDeviceTypes, workerDeviceTypes,
+        bindIdNameDeviceTypes, bindStrNameDeviceTypes, bindIdNames,
+        bindStrNames, gangDeviceTypes, gangDimValues, gangDimDeviceTypes);
 
     // Device Dependent Attributes
     for (const Fortran::semantics::OpenACCRoutineDeviceTypeInfo &dinfo :
          info.deviceTypeInfos()) {
-      interpretRoutineDeviceInfo(
-          converter, dinfo, seqDeviceTypes, vectorDeviceTypes,
-          workerDeviceTypes, bindNameDeviceTypes, bindNames, gangDeviceTypes,
-          gangDimValues, gangDimDeviceTypes);
+      interpretRoutineDeviceInfo(converter, dinfo, seqDeviceTypes,
+                                 vectorDeviceTypes, workerDeviceTypes,
+                                 bindIdNameDeviceTypes, bindStrNameDeviceTypes,
+                                 bindIdNames, bindStrNames, gangDeviceTypes,
+                                 gangDimValues, gangDimDeviceTypes);
     }
   }
   createOpenACCRoutineConstruct(
-      converter, loc, mod, funcOp, funcName, hasNohost, bindNames,
-      bindNameDeviceTypes, gangDeviceTypes, gangDimValues, gangDimDeviceTypes,
-      seqDeviceTypes, workerDeviceTypes, vectorDeviceTypes);
+      converter, loc, mod, funcOp, funcName, hasNohost, bindIdNames,
+      bindStrNames, bindIdNameDeviceTypes, bindStrNameDeviceTypes,
+      gangDeviceTypes, gangDimValues, gangDimDeviceTypes, seqDeviceTypes,
+      workerDeviceTypes, vectorDeviceTypes);
 }
 
 static void
diff --git a/flang/test/Lower/OpenACC/acc-routine.f90 b/flang/test/Lower/OpenACC/acc-routine.f90
index 789f3a57e1f7..1a63b4120235 100644
--- a/flang/test/Lower/OpenACC/acc-routine.f90
+++ b/flang/test/Lower/OpenACC/acc-routine.f90
@@ -2,13 +2,14 @@
 
 ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
 
-! CHECK: acc.routine @[[r14:.*]] func(@_QPacc_routine19) bind("_QPacc_routine17" [#acc.device_type<host>], "_QPacc_routine17" [#acc.device_type<default>], "_QPacc_routine16" [#acc.device_type<multicore>])
-! CHECK: acc.routine @[[r13:.*]] func(@_QPacc_routine18) bind("_QPacc_routine17" [#acc.device_type<host>], "_QPacc_routine16" [#acc.device_type<multicore>])
+! CHECK: acc.routine @[[r14:.*]] func(@_QPacc_routine19) bind(@_QPacc_routine17 [#acc.device_type<host>], @_QPacc_routine17
+! [#acc.device_type<default>], @_QPacc_routine16 [#acc.device_type<multicore>])
+! CHECK: acc.routine @[[r13:.*]] func(@_QPacc_routine18) bind(@_QPacc_routine17 [#acc.device_type<host>], @_QPacc_routine16 [#acc.device_type<multicore>])
 ! CHECK: acc.routine @[[r12:.*]] func(@_QPacc_routine17) worker ([#acc.device_type<host>]) vector ([#acc.device_type<multicore>])
 ! CHECK: acc.routine @[[r11:.*]] func(@_QPacc_routine16) gang([#acc.device_type<nvidia>]) seq ([#acc.device_type<host>])
 ! CHECK: acc.routine @[[r10:.*]] func(@_QPacc_routine11) seq
 ! CHECK: acc.routine @[[r09:.*]] func(@_QPacc_routine10) seq
-! CHECK: acc.routine @[[r08:.*]] func(@_QPacc_routine9) bind("_QPacc_routine9a")
+! CHECK: acc.routine @[[r08:.*]] func(@_QPacc_routine9) bind(@_QPacc_routine9a)
 ! CHECK: acc.routine @[[r07:.*]] func(@_QPacc_routine8) bind("routine8_")
 ! CHECK: acc.routine @[[r06:.*]] func(@_QPacc_routine7) gang(dim: 1 : i64)
 ! CHECK: acc.routine @[[r05:.*]] func(@_QPacc_routine6) nohost
diff --git a/flang/test/Lower/OpenACC/acc-routine03.f90 b/flang/test/Lower/OpenACC/acc-routine03.f90
index 85e4ef580f98..ddd6bda0367e 100644
--- a/flang/test/Lower/OpenACC/acc-routine03.f90
+++ b/flang/test/Lower/OpenACC/acc-routine03.f90
@@ -30,6 +30,6 @@ end interface
 end subroutine
 
 ! CHECK: acc.routine @acc_routine_1 func(@_QPsub2) worker nohost
-! CHECK: acc.routine @acc_routine_0 func(@_QPsub1) bind("_QPsub2") worker
+! CHECK: acc.routine @acc_routine_0 func(@_QPsub1) bind(@_QPsub2) worker
 ! CHECK: func.func @_QPsub1(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_0]>}
 ! CHECK: func.func @_QPsub2(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_1]>}
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
index 4eb666239d4e..8f87235fcd23 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
@@ -29,6 +29,7 @@
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
+#include <variant>
 
 #define GET_TYPEDEF_CLASSES
 #include "mlir/Dialect/OpenACC/OpenACCOpsTypes.h.inc"
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 66378f116784..96b9adcc53b3 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -2772,8 +2772,10 @@ def OpenACC_RoutineOp : OpenACC_Op<"routine", [IsolatedFromAbove]> {
   }];
 
   let arguments = (ins SymbolNameAttr:$sym_name, SymbolRefAttr:$func_name,
-      OptionalAttr<StrArrayAttr>:$bindName,
-      OptionalAttr<DeviceTypeArrayAttr>:$bindNameDeviceType,
+      OptionalAttr<SymbolRefArrayAttr>:$bindIdName,
+      OptionalAttr<StrArrayAttr>:$bindStrName,
+      OptionalAttr<DeviceTypeArrayAttr>:$bindIdNameDeviceType,
+      OptionalAttr<DeviceTypeArrayAttr>:$bindStrNameDeviceType,
       OptionalAttr<DeviceTypeArrayAttr>:$worker,
       OptionalAttr<DeviceTypeArrayAttr>:$vector,
       OptionalAttr<DeviceTypeArrayAttr>:$seq, UnitAttr:$nohost,
@@ -2815,14 +2817,14 @@ def OpenACC_RoutineOp : OpenACC_Op<"routine", [IsolatedFromAbove]> {
     std::optional<int64_t> getGangDimValue();
     std::optional<int64_t> getGangDimValue(mlir::acc::DeviceType deviceType);
 
-    std::optional<llvm::StringRef> getBindNameValue();
-    std::optional<llvm::StringRef> getBindNameValue(mlir::acc::DeviceType deviceType);
+    std::optional<::std::variant<mlir::SymbolRefAttr, mlir::StringAttr>> getBindNameValue();
+    std::optional<::std::variant<mlir::SymbolRefAttr, mlir::StringAttr>> getBindNameValue(mlir::acc::DeviceType deviceType);
   }];
 
   let assemblyFormat = [{
     $sym_name `func` `(` $func_name `)`
     oilist (
-        `bind` `(` custom<BindName>($bindName, $bindNameDeviceType) `)`
+        `bind` `(` custom<BindName>($bindIdName, $bindStrName ,$bindIdNameDeviceType, $bindStrNameDeviceType) `)`
       | `gang` `` custom<RoutineGangClause>($gang, $gangDim, $gangDimDeviceType)
       | `worker` custom<DeviceTypeArrayAttr>($worker)
       | `vector` custom<DeviceTypeArrayAttr>($vector)
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index f2eab62b286a..fbc1f003ab64 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -21,6 +21,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/LogicalResult.h"
+#include <variant>
 
 using namespace mlir;
 using namespace acc;
@@ -3461,40 +3462,88 @@ LogicalResult acc::RoutineOp::verify() {
   return success();
 }
 
-static ParseResult parseBindName(OpAsmParser &parser, mlir::ArrayAttr &bindName,
-                                 mlir::ArrayAttr &deviceTypes) {
-  llvm::SmallVector<mlir::Attribute> bindNameAttrs;
-  llvm::SmallVector<mlir::Attribute> deviceTypeAttrs;
+static ParseResult parseBindName(OpAsmParser &parser,
+                                 mlir::ArrayAttr &bindIdName,
+                                 mlir::ArrayAttr &bindStrName,
+                                 mlir::ArrayAttr &deviceIdTypes,
+                                 mlir::ArrayAttr &deviceStrTypes) {
+  llvm::SmallVector<mlir::Attribute> bindIdNameAttrs;
+  llvm::SmallVector<mlir::Attribute> bindStrNameAttrs;
+  llvm::SmallVector<mlir::Attribute> deviceIdTypeAttrs;
+  llvm::SmallVector<mlir::Attribute> deviceStrTypeAttrs;
 
   if (failed(parser.parseCommaSeparatedList([&]() {
-        if (parser.parseAttribute(bindNameAttrs.emplace_back()))
+        mlir::Attribute newAttr;
+        bool isSymbolRefAttr;
+        auto parseResult = parser.parseAttribute(newAttr);
+        if (auto symbolRefAttr = dyn_cast<mlir::SymbolRefAttr>(newAttr)) {
+          bindIdNameAttrs.push_back(symbolRefAttr);
+          isSymbolRefAttr = true;
+        } else if (auto stringAttr = dyn_cast<mlir::StringAttr>(newAttr)) {
+          bindStrNameAttrs.push_back(stringAttr);
+          isSymbolRefAttr = false;
+        }
+        if (parseResult)
           return failure();
         if (failed(parser.parseOptionalLSquare())) {
-          deviceTypeAttrs.push_back(mlir::acc::DeviceTypeAttr::get(
-              parser.getContext(), mlir::acc::DeviceType::None));
+          if (isSymbolRefAttr) {
+            deviceIdTypeAttrs.push_back(mlir::acc::DeviceTypeAttr::get(
+                parser.getContext(), mlir::acc::DeviceType::None));
+          } else {
+            deviceStrTypeAttrs.push_back(mlir::acc::DeviceTypeAttr::get(
+                parser.getContext(), mlir::acc::DeviceType::None));
+          }
         } else {
-          if (parser.parseAttribute(deviceTypeAttrs.emplace_back()) ||
-              parser.parseRSquare())
-            return failure();
+          if (isSymbolRefAttr) {
+            if (parser.parseAttribute(deviceIdTypeAttrs.emplace_back()) ||
+                parser.parseRSquare())
+              return failure();
+          } else {
+            if (parser.parseAttribute(deviceStrTypeAttrs.emplace_back()) ||
+                parser.parseRSquare())
+              return failure();
+          }
         }
         return success();
       })))
     return failure();
 
-  bindName = ArrayAttr::get(parser.getContext(), bindNameAttrs);
-  deviceTypes = ArrayAttr::get(parser.getContext(), deviceTypeAttrs);
+  bindIdName = ArrayAttr::get(parser.getContext(), bindIdNameAttrs);
+  bindStrName = ArrayAttr::get(parser.getContext(), bindStrNameAttrs);
+  deviceIdTypes = ArrayAttr::get(parser.getContext(), deviceIdTypeAttrs);
+  deviceStrTypes = ArrayAttr::get(parser.getContext(), deviceStrTypeAttrs);
 
   return success();
 }
 
 static void printBindName(mlir::OpAsmPrinter &p, mlir::Operation *op,
-                          std::optional<mlir::ArrayAttr> bindName,
-                          std::optional<mlir::ArrayAttr> deviceTypes) {
-  llvm::interleaveComma(llvm::zip(*bindName, *deviceTypes), p,
-                        [&](const auto &pair) {
-                          p << std::get<0>(pair);
-                          printSingleDeviceType(p, std::get<1>(pair));
-                        });
+                          std::optional<mlir::ArrayAttr> bindIdName,
+                          std::optional<mlir::ArrayAttr> bindStrName,
+                          std::optional<mlir::ArrayAttr> deviceIdTypes,
+                          std::optional<mlir::ArrayAttr> deviceStrTypes) {
+  // Create combined vectors for all bind names and device types
+  llvm::SmallVector<mlir::Attribute> allBindNames;
+  llvm::SmallVector<mlir::Attribute> allDeviceTypes;
+
+  // Append bindIdName and deviceIdTypes
+  if (hasDeviceTypeValues(deviceIdTypes)) {
+    allBindNames.append(bindIdName->begin(), bindIdName->end());
+    allDeviceTypes.append(deviceIdTypes->begin(), deviceIdTypes->end());
+  }
+
+  // Append bindStrName and deviceStrTypes
+  if (hasDeviceTypeValues(deviceStrTypes)) {
+    allBindNames.append(bindStrName->begin(), bindStrName->end());
+    allDeviceTypes.append(deviceStrTypes->begin(), deviceStrTypes->end());
+  }
+
+  // Print the combined sequence
+  if (!allBindNames.empty())
+    llvm::interleaveComma(llvm::zip(allBindNames, allDeviceTypes), p,
+                          [&](const auto &pair) {
+                            p << std::get<0>(pair);
+                            printSingleDeviceType(p, std::get<1>(pair));
+                          });
 }
 
 static ParseResult parseRoutineGangClause(OpAsmParser &parser,
@@ -3654,19 +3703,32 @@ bool RoutineOp::hasSeq(mlir::acc::DeviceType deviceType) {
   return hasDeviceType(getSeq(), deviceType);
 }
 
-std::optional<llvm::StringRef> RoutineOp::getBindNameValue() {
+std::optional<std::variant<mlir::SymbolRefAttr, mlir::StringAttr>>
+RoutineOp::getBindNameValue() {
   return getBindNameValue(mlir::acc::DeviceType::None);
 }
 
-std::optional<llvm::StringRef>
+std::optional<std::variant<mlir::SymbolRefAttr, mlir::StringAttr>>
 RoutineOp::getBindNameValue(mlir::acc::DeviceType deviceType) {
-  if (!hasDeviceTypeValues(getBindNameDeviceType()))
+  if (!hasDeviceTypeValues(getBindIdNameDeviceType()) &&
+      !hasDeviceTypeValues(getBindStrNameDeviceType())) {
     return std::nullopt;
-  if (auto pos = findSegment(*getBindNameDeviceType(), deviceType)) {
-    auto attr = (*getBindName())[*pos];
+  }
+
+  if (auto pos = findSegment(*getBindIdNameDeviceType(), deviceType)) {
+    auto attr = (*getBindIdName())[*pos];
+    auto symbolRefAttr = dyn_cast<mlir::SymbolRefAttr>(attr);
+    assert(symbolRefAttr && "expected SymbolRef");
+    return symbolRefAttr;
+  }
+
+  if (auto pos = findSegment(*getBindStrNameDeviceType(), deviceType)) {
+    auto attr = (*getBindStrName())[*pos];
     auto stringAttr = dyn_cast<mlir::StringAttr>(attr);
-    return stringAttr.getValue();
+    assert(stringAttr && "expected String");
+    return stringAttr;
   }
+
   return std::nullopt;
 }
 
diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp
index aa16421cbec5..836efdb307f9 100644
--- a/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp
+++ b/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp
@@ -519,14 +519,44 @@ TEST_F(OpenACCOpsTest, routineOpTest) {
   op->removeGangDimDeviceTypeAttr();
   op->removeGangDimAttr();
 
-  op->setBindNameDeviceTypeAttr(b.getArrayAttr({dtypeNone}));
-  op->setBindNameAttr(b.getArrayAttr({b.getStringAttr("fname")}));
+  op->setBindIdNameDeviceTypeAttr(
+      b.getArrayAttr({DeviceTypeAttr::get(&context, DeviceType::Host)}));
+  op->setBindStrNameDeviceTypeAttr(b.getArrayAttr({dtypeNone}));
+  op->setBindIdNameAttr(
+      b.getArrayAttr({SymbolRefAttr::get(&context, "test_symbol")}));
+  op->setBindStrNameAttr(b.getArrayAttr({b.getStringAttr("fname")}));
   EXPECT_TRUE(op->getBindNameValue().has_value());
-  EXPECT_EQ(op->getBindNameValue().value(), "fname");
-  for (auto d : dtypesWithoutNone)
-    EXPECT_FALSE(op->getBindNameValue(d).has_value());
-  op->removeBindNameDeviceTypeAttr();
-  op->removeBindNameAttr();
+  EXPECT_TRUE(op->getBindNameValue(DeviceType::Host).has_value());
+  EXPECT_EQ(std::visit(
+                [](const auto &attr) -> std::string {
+                  if constexpr (std::is_same_v<std::decay_t<decltype(attr)>,
+                                               mlir::StringAttr>) {
+                    return attr.str();
+                  } else {
+                    return attr.getLeafReference().str();
+                  }
+                },
+                op->getBindNameValue().value()),
+            "fname");
+  EXPECT_EQ(std::visit(
+                [](const auto &attr) -> std::string {
+                  if constexpr (std::is_same_v<std::decay_t<decltype(attr)>,
+                                               mlir::StringAttr>) {
+                    return attr.str();
+                  } else {
+                    return attr.getLeafReference().str();
+                  }
+                },
+                op->getBindNameValue(DeviceType::Host).value()),
+            "test_symbol");
+  for (auto d : dtypesWithoutNone) {
+    if (d != DeviceType::Host)
+      EXPECT_FALSE(op->getBindNameValue(d).has_value());
+  }
+  op->removeBindIdNameDeviceTypeAttr();
+  op->removeBindStrNameDeviceTypeAttr();
+  op->removeBindIdNameAttr();
+  op->removeBindStrNameAttr();
 }
 
 template <typename Op>
-- 
cgit v1.2.3


From b3a8d0efc907aae8198ff16e5bfb8dc48f08b6ca Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Thu, 17 Jul 2025 09:42:36 -0700
Subject: [RISCV] Add additional coverage for one hot interleave load cases
 [nfc]

Add coverage for fixed vector vp.load, and the deinterleave intrinsic paths.
---
 .../RISCV/rvv/fixed-vectors-interleaved-access.ll  | 30 +++++++++++++--
 .../RISCV/rvv/vp-vector-interleaved-access.ll      | 45 ++++++++++++++++++++++
 2 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 041aae229288..019bbe2908a2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -1718,6 +1718,28 @@ define void @load_factor4_one_active_storeback_full(ptr %ptr) {
   ret void
 }
 
+define <4 x i32> @vp_load_factor3_one_active(ptr %ptr) {
+; CHECK-LABEL: vp_load_factor3_one_active:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vlseg3e32.v v8, (a0)
+; CHECK-NEXT:    ret
+  %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1>  splat (i1 true), i32 12)
+  %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  ret <4 x i32> %v0
+}
+
+define <4 x i32> @vp_load_factor5_one_active(ptr %ptr) {
+; CHECK-LABEL: vp_load_factor5_one_active:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vlseg5e32.v v8, (a0)
+; CHECK-NEXT:    ret
+ %interleaved.vec = tail call <20 x i32> @llvm.vp.load.v20i32.p0(ptr %ptr, <20 x i1>  splat (i1 true), i32 20)
+  %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
+  ret <4 x i32> %v0
+}
+
 define void @store_factor4_one_active(ptr %ptr, <4 x i32> %v) {
 ; CHECK-LABEL: store_factor4_one_active:
 ; CHECK:       # %bb.0:
@@ -1804,8 +1826,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) {
 ; RV32-NEXT:    vle32.v v12, (a0), v0.t
 ; RV32-NEXT:    li a0, 36
 ; RV32-NEXT:    vmv.s.x v20, a1
-; RV32-NEXT:    lui a1, %hi(.LCPI51_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI51_0)
+; RV32-NEXT:    lui a1, %hi(.LCPI53_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI53_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v21, (a1)
 ; RV32-NEXT:    vcompress.vm v8, v12, v11
@@ -1880,8 +1902,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) {
 ; RV32-NEXT:    vmv.s.x v10, a0
 ; RV32-NEXT:    li a0, 146
 ; RV32-NEXT:    vmv.s.x v11, a0
-; RV32-NEXT:    lui a0, %hi(.LCPI52_0)
-; RV32-NEXT:    addi a0, a0, %lo(.LCPI52_0)
+; RV32-NEXT:    lui a0, %hi(.LCPI54_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI54_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v20, (a0)
 ; RV32-NEXT:    li a0, 36
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
index 8cfa237858ac..23c0c826e85e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -648,6 +648,51 @@ define void @masked_store_factor4_v2(<vscale x 1 x i1> %mask, <vscale x 1 x i32>
   ret void
 }
 
+define <vscale x 2 x i32> @load_factor2_oneactive(ptr %ptr, i32 %evl) {
+; RV32-LABEL: load_factor2_oneactive:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    srli a1, a1, 1
+; RV32-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT:    vlseg2e32.v v7, (a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: load_factor2_oneactive:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a1, a1, 34
+; RV64-NEXT:    srli a1, a1, 33
+; RV64-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT:    vlseg2e32.v v7, (a0)
+; RV64-NEXT:    ret
+  %rvl = mul nuw i32 %evl, 4
+  %wide.masked.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %rvl)
+  %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %wide.masked.load)
+  %t0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 1
+  ret <vscale x 2 x i32> %t0
+}
+
+define <vscale x 2 x i32> @load_factor5_oneactive(ptr %ptr, i32 %evl) {
+; RV32-LABEL: load_factor5_oneactive:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; RV32-NEXT:    vlseg5e32.v v5, (a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: load_factor5_oneactive:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a1, a1, 32
+; RV64-NEXT:    srli a1, a1, 32
+; RV64-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; RV64-NEXT:    vlseg5e32.v v5, (a0)
+; RV64-NEXT:    ret
+  %rvl = mul nuw i32 %evl, 5
+  %wide.masked.load = call <vscale x 10 x i32> @llvm.vp.load(ptr %ptr, <vscale x 10 x i1> splat (i1 true), i32 %rvl)
+  %deinterleaved.results = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave5(<vscale x 10 x i32> %wide.masked.load)
+  %t3 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %deinterleaved.results, 3
+  ret <vscale x 2 x i32> %t3
+}
+
+
 ; Negative tests
 
 define {<vscale x 2 x i32>, <vscale x 2 x i32>} @not_same_mask(<vscale x 2 x i1> %mask0, <vscale x 2 x i1> %mask1, ptr %ptr, i32 %evl) {
-- 
cgit v1.2.3


From e4a3541ff88af03c01007a94b6b5f5cea95ecf33 Mon Sep 17 00:00:00 2001
From: Akshay Khadse <akshayskhadse@gmail.com>
Date: Thu, 17 Jul 2025 12:57:04 -0400
Subject: [MLIR][Python] Support eliding large resource strings in PassManager
 (#149187)

- Introduces a `large_resource_limit` parameter across Python bindings,
enabling the eliding of resource strings exceeding a specified character
limit during IR printing.
- To maintain backward compatibility, when using the `operation.print()` API,
if `large_resource_limit` is None and the `large_elements_limit` is set,
the latter will be used to elide the resource string as well. This change
was introduced by https://github.com/llvm/llvm-project/pull/125738.
- For printing using pass manager, the `large_resource_limit` and
`large_elements_limit` are completely independent of each other.
---
 mlir/lib/Bindings/Python/IRCore.cpp               | 22 ++++++---
 mlir/lib/Bindings/Python/IRModule.h               | 14 +++---
 mlir/lib/Bindings/Python/Pass.cpp                 | 12 ++++-
 mlir/python/mlir/_mlir_libs/_mlir/ir.pyi          |  7 +++
 mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi |  1 +
 mlir/test/python/ir/operation.py                  |  9 ++++
 mlir/test/python/pass_manager.py                  | 57 +++++++++++++++++++++++
 7 files changed, 107 insertions(+), 15 deletions(-)

diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index d96148288530..7b790e90e0d8 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -97,6 +97,10 @@ Args:
   binary: Whether to write bytes (True) or str (False). Defaults to False.
   large_elements_limit: Whether to elide elements attributes above this
     number of elements. Defaults to None (no limit).
+  large_resource_limit: Whether to elide resource attributes above this
+    number of characters. Defaults to None (no limit). If large_elements_limit
+    is set and this is None, the behavior will be to use large_elements_limit
+    as large_resource_limit.
   enable_debug_info: Whether to print debug/location information. Defaults
     to False.
   pretty_debug_info: Whether to format debug information for easier reading
@@ -1303,6 +1307,7 @@ void PyOperation::checkValid() const {
 }
 
 void PyOperationBase::print(std::optional<int64_t> largeElementsLimit,
+                            std::optional<int64_t> largeResourceLimit,
                             bool enableDebugInfo, bool prettyDebugInfo,
                             bool printGenericOpForm, bool useLocalScope,
                             bool useNameLocAsPrefix, bool assumeVerified,
@@ -1314,10 +1319,10 @@ void PyOperationBase::print(std::optional<int64_t> largeElementsLimit,
     fileObject = nb::module_::import_("sys").attr("stdout");
 
   MlirOpPrintingFlags flags = mlirOpPrintingFlagsCreate();
-  if (largeElementsLimit) {
+  if (largeElementsLimit)
     mlirOpPrintingFlagsElideLargeElementsAttrs(flags, *largeElementsLimit);
-    mlirOpPrintingFlagsElideLargeResourceString(flags, *largeElementsLimit);
-  }
+  if (largeResourceLimit)
+    mlirOpPrintingFlagsElideLargeResourceString(flags, *largeResourceLimit);
   if (enableDebugInfo)
     mlirOpPrintingFlagsEnableDebugInfo(flags, /*enable=*/true,
                                        /*prettyForm=*/prettyDebugInfo);
@@ -1405,6 +1410,7 @@ void PyOperationBase::walk(
 
 nb::object PyOperationBase::getAsm(bool binary,
                                    std::optional<int64_t> largeElementsLimit,
+                                   std::optional<int64_t> largeResourceLimit,
                                    bool enableDebugInfo, bool prettyDebugInfo,
                                    bool printGenericOpForm, bool useLocalScope,
                                    bool useNameLocAsPrefix, bool assumeVerified,
@@ -1416,6 +1422,7 @@ nb::object PyOperationBase::getAsm(bool binary,
     fileObject = nb::module_::import_("io").attr("StringIO")();
   }
   print(/*largeElementsLimit=*/largeElementsLimit,
+        /*largeResourceLimit=*/largeResourceLimit,
         /*enableDebugInfo=*/enableDebugInfo,
         /*prettyDebugInfo=*/prettyDebugInfo,
         /*printGenericOpForm=*/printGenericOpForm,
@@ -3348,6 +3355,7 @@ void mlir::python::populateIRCore(nb::module_ &m) {
           [](PyOperationBase &self) {
             return self.getAsm(/*binary=*/false,
                                /*largeElementsLimit=*/std::nullopt,
+                               /*largeResourceLimit=*/std::nullopt,
                                /*enableDebugInfo=*/false,
                                /*prettyDebugInfo=*/false,
                                /*printGenericOpForm=*/false,
@@ -3363,11 +3371,12 @@ void mlir::python::populateIRCore(nb::module_ &m) {
            nb::arg("state"), nb::arg("file").none() = nb::none(),
            nb::arg("binary") = false, kOperationPrintStateDocstring)
       .def("print",
-           nb::overload_cast<std::optional<int64_t>, bool, bool, bool, bool,
-                             bool, bool, nb::object, bool, bool>(
-               &PyOperationBase::print),
+           nb::overload_cast<std::optional<int64_t>, std::optional<int64_t>,
+                             bool, bool, bool, bool, bool, bool, nb::object,
+                             bool, bool>(&PyOperationBase::print),
            // Careful: Lots of arguments must match up with print method.
            nb::arg("large_elements_limit").none() = nb::none(),
+           nb::arg("large_resource_limit").none() = nb::none(),
            nb::arg("enable_debug_info") = false,
            nb::arg("pretty_debug_info") = false,
            nb::arg("print_generic_op_form") = false,
@@ -3383,6 +3392,7 @@ void mlir::python::populateIRCore(nb::module_ &m) {
            // Careful: Lots of arguments must match up with get_asm method.
            nb::arg("binary") = false,
            nb::arg("large_elements_limit").none() = nb::none(),
+           nb::arg("large_resource_limit").none() = nb::none(),
            nb::arg("enable_debug_info") = false,
            nb::arg("pretty_debug_info") = false,
            nb::arg("print_generic_op_form") = false,
diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h
index 9befcce725bb..0fdd2d1a7eff 100644
--- a/mlir/lib/Bindings/Python/IRModule.h
+++ b/mlir/lib/Bindings/Python/IRModule.h
@@ -599,18 +599,18 @@ class PyOperationBase {
 public:
   virtual ~PyOperationBase() = default;
   /// Implements the bound 'print' method and helps with others.
-  void print(std::optional<int64_t> largeElementsLimit, bool enableDebugInfo,
+  void print(std::optional<int64_t> largeElementsLimit,
+             std::optional<int64_t> largeResourceLimit, bool enableDebugInfo,
              bool prettyDebugInfo, bool printGenericOpForm, bool useLocalScope,
              bool useNameLocAsPrefix, bool assumeVerified,
              nanobind::object fileObject, bool binary, bool skipRegions);
   void print(PyAsmState &state, nanobind::object fileObject, bool binary);
 
-  nanobind::object getAsm(bool binary,
-                          std::optional<int64_t> largeElementsLimit,
-                          bool enableDebugInfo, bool prettyDebugInfo,
-                          bool printGenericOpForm, bool useLocalScope,
-                          bool useNameLocAsPrefix, bool assumeVerified,
-                          bool skipRegions);
+  nanobind::object
+  getAsm(bool binary, std::optional<int64_t> largeElementsLimit,
+         std::optional<int64_t> largeResourceLimit, bool enableDebugInfo,
+         bool prettyDebugInfo, bool printGenericOpForm, bool useLocalScope,
+         bool useNameLocAsPrefix, bool assumeVerified, bool skipRegions);
 
   // Implement the bound 'writeBytecode' method.
   void writeBytecode(const nanobind::object &fileObject,
diff --git a/mlir/lib/Bindings/Python/Pass.cpp b/mlir/lib/Bindings/Python/Pass.cpp
index 8d84864b9db4..20017e25b69b 100644
--- a/mlir/lib/Bindings/Python/Pass.cpp
+++ b/mlir/lib/Bindings/Python/Pass.cpp
@@ -78,12 +78,19 @@ void mlir::python::populatePassManagerSubmodule(nb::module_ &m) {
           [](PyPassManager &passManager, bool printBeforeAll,
              bool printAfterAll, bool printModuleScope, bool printAfterChange,
              bool printAfterFailure, std::optional<int64_t> largeElementsLimit,
-             bool enableDebugInfo, bool printGenericOpForm,
+             std::optional<int64_t> largeResourceLimit, bool enableDebugInfo,
+             bool printGenericOpForm,
              std::optional<std::string> optionalTreePrintingPath) {
             MlirOpPrintingFlags flags = mlirOpPrintingFlagsCreate();
-            if (largeElementsLimit)
+            if (largeElementsLimit) {
               mlirOpPrintingFlagsElideLargeElementsAttrs(flags,
                                                          *largeElementsLimit);
+              mlirOpPrintingFlagsElideLargeResourceString(flags,
+                                                          *largeElementsLimit);
+            }
+            if (largeResourceLimit)
+              mlirOpPrintingFlagsElideLargeResourceString(flags,
+                                                          *largeResourceLimit);
             if (enableDebugInfo)
               mlirOpPrintingFlagsEnableDebugInfo(flags, /*enable=*/true,
                                                  /*prettyForm=*/false);
@@ -103,6 +110,7 @@ void mlir::python::populatePassManagerSubmodule(nb::module_ &m) {
           "print_module_scope"_a = false, "print_after_change"_a = false,
           "print_after_failure"_a = false,
           "large_elements_limit"_a.none() = nb::none(),
+          "large_resource_limit"_a.none() = nb::none(),
           "enable_debug_info"_a = false, "print_generic_op_form"_a = false,
           "tree_printing_dir_path"_a.none() = nb::none(),
           "Enable IR printing, default as mlir-print-ir-after-all.")
diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
index ed476da28d6b..be71737e4b5b 100644
--- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
+++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
@@ -200,6 +200,7 @@ class _OperationBase:
     def get_asm(
         binary: Literal[True],
         large_elements_limit: int | None = None,
+        large_resource_limit: int | None = None,
         enable_debug_info: bool = False,
         pretty_debug_info: bool = False,
         print_generic_op_form: bool = False,
@@ -212,6 +213,7 @@ class _OperationBase:
         self,
         binary: bool = False,
         large_elements_limit: int | None = None,
+        large_resource_limit: int | None = None,
         enable_debug_info: bool = False,
         pretty_debug_info: bool = False,
         print_generic_op_form: bool = False,
@@ -253,6 +255,7 @@ class _OperationBase:
     def print(
         self,
         large_elements_limit: int | None = None,
+        large_resource_limit: int | None = None,
         enable_debug_info: bool = False,
         pretty_debug_info: bool = False,
         print_generic_op_form: bool = False,
@@ -270,6 +273,10 @@ class _OperationBase:
           binary: Whether to write bytes (True) or str (False). Defaults to False.
           large_elements_limit: Whether to elide elements attributes above this
             number of elements. Defaults to None (no limit).
+          large_resource_limit: Whether to elide resource strings above this
+            number of characters. Defaults to None (no limit). If large_elements_limit
+            is set and this is None, the behavior will be to use large_elements_limit
+            as large_resource_limit.
           enable_debug_info: Whether to print debug/location information. Defaults
             to False.
           pretty_debug_info: Whether to format debug information for easier reading
diff --git a/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi b/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi
index 0d2eaffe16d3..1010daddae2a 100644
--- a/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi
+++ b/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi
@@ -23,6 +23,7 @@ class PassManager:
         print_after_change: bool = False,
         print_after_failure: bool = False,
         large_elements_limit: int | None = None,
+        large_resource_limit: int | None = None,
         enable_debug_info: bool = False,
         print_generic_op_form: bool = False,
         tree_printing_dir_path: str | None = None,
diff --git a/mlir/test/python/ir/operation.py b/mlir/test/python/ir/operation.py
index b08fe98397fb..ede1571f940f 100644
--- a/mlir/test/python/ir/operation.py
+++ b/mlir/test/python/ir/operation.py
@@ -686,6 +686,15 @@ def testOperationPrint():
         skip_regions=True,
     )
 
+    # Test print with large_resource_limit.
+    # CHECK: func.func @f1(%arg0: i32) -> i32
+    # CHECK-NOT: resource1: "0x08
+    module.operation.print(large_resource_limit=2)
+
+    # Test large_elements_limit has no effect on resource string
+    # CHECK: func.func @f1(%arg0: i32) -> i32
+    # CHECK: resource1: "0x08
+    module.operation.print(large_elements_limit=2)
 
 # CHECK-LABEL: TEST: testKnownOpView
 @run
diff --git a/mlir/test/python/pass_manager.py b/mlir/test/python/pass_manager.py
index 85d2eb304882..e26d42bb3291 100644
--- a/mlir/test/python/pass_manager.py
+++ b/mlir/test/python/pass_manager.py
@@ -363,6 +363,63 @@ def testPrintIrLargeLimitElements():
         pm.run(module)
 
 
+# CHECK-LABEL: TEST: testPrintIrLargeResourceLimit
+@run
+def testPrintIrLargeResourceLimit():
+    with Context() as ctx:
+        module = ModuleOp.parse(
+            """
+          module {
+            func.func @main() -> tensor<3xi64> {
+              %0 = arith.constant dense_resource<blob1> : tensor<3xi64>
+              return %0 : tensor<3xi64>
+            }
+          }
+          {-#
+            dialect_resources: {
+              builtin: {
+                blob1: "0x010000000000000002000000000000000300000000000000"
+              }
+            }
+          #-}
+        """
+        )
+        pm = PassManager.parse("builtin.module(canonicalize)")
+        ctx.enable_multithreading(False)
+        pm.enable_ir_printing(large_resource_limit=4)
+        # CHECK-NOT: blob1: "0x01
+        pm.run(module)
+
+
+# CHECK-LABEL: TEST: testPrintIrLargeResourceLimitVsElementsLimit
+@run
+def testPrintIrLargeResourceLimitVsElementsLimit():
+    """Test that large_elements_limit does not affect the printing of resources."""
+    with Context() as ctx:
+        module = ModuleOp.parse(
+            """
+          module {
+            func.func @main() -> tensor<3xi64> {
+              %0 = arith.constant dense_resource<blob1> : tensor<3xi64>
+              return %0 : tensor<3xi64>
+            }
+          }
+          {-#
+            dialect_resources: {
+              builtin: {
+                blob1: "0x010000000000000002000000000000000300000000000000"
+              }
+            }
+          #-}
+        """
+        )
+        pm = PassManager.parse("builtin.module(canonicalize)")
+        ctx.enable_multithreading(False)
+        pm.enable_ir_printing(large_elements_limit=1)
+        # CHECK-NOT: blob1: "0x01
+        pm.run(module)
+
+
 # CHECK-LABEL: TEST: testPrintIrTree
 @run
 def testPrintIrTree():
-- 
cgit v1.2.3


From d35931c49e5b37243ace2b79bec87463772b6c94 Mon Sep 17 00:00:00 2001
From: T0b1-iOS <T0b1-iOS@users.noreply.github.com>
Date: Thu, 17 Jul 2025 18:57:32 +0200
Subject: [Clang][CodeGen][X86] don't coerce int128 into `{i64,i64}` for
 SysV-like ABIs (#135230)

Currently, clang coerces (u)int128_t to two i64 IR parameters when they
are passed in registers. This leads to broken debug info for them after
applying SROA+InstCombine. SROA generates IR like this
([godbolt](https://godbolt.org/z/YrTa4chfc)):
```llvm
define dso_local { i64, i64 } @add(i64 noundef %a.coerce0, i64 noundef %a.coerce1)  {
entry:
  %a.sroa.2.0.insert.ext = zext i64 %a.coerce1 to i128
  %a.sroa.2.0.insert.shift = shl nuw i128 %a.sroa.2.0.insert.ext, 64
  %a.sroa.0.0.insert.ext = zext i64 %a.coerce0 to i128
  %a.sroa.0.0.insert.insert = or i128 %a.sroa.2.0.insert.shift, %a.sroa.0.0.insert.ext
    #dbg_value(i128 %a.sroa.0.0.insert.insert, !17, !DIExpression(), !18)
// ...
!17 = !DILocalVariable(name: "a", arg: 1, scope: !10, file: !11, line: 1, type: !14)
// ...
```

and InstCombine then removes the `or` (moving it into the
`DIExpression`) and the `shl`, at which point the debug info salvaging in
`Transforms/Local` replaces the arguments with `poison`, as it does not
allow constants larger than 64 bits in `DIExpression`s.

I'm working under the assumption that there is interest in fixing this.
If not, please tell me.
By not coercing `int128_t`s into `{i64, i64}` but keeping them as
`i128`, the debug info stays intact and SelectionDAG then generates two
`DW_OP_LLVM_fragment` expressions for the two corresponding argument
registers.

Given that the ABI code for x64 does not seem to coerce the argument when
it is passed on the stack, keeping it as an `i128` when it is passed in
registers should not lead to any problems.

Alternatively, this could be fixed by checking if a constant value fits
in 64 bits in the debug info salvaging code and then extending the value
on the expression stack to the necessary width. This fixes InstCombine
breaking the debug info, but SelectionDAG then removes the expression,
and that seems significantly more complex to debug.

Another fix may be to generate `DW_OP_LLVM_fragment` expressions when
removing the `or`, as it gets marked as disjoint by InstCombine. However,
I don't know if the KnownBits information is still available at the time
the `or` gets removed, and it would probably require refactoring of the
debug info salvaging code, as that currently only seems to replace single
expressions and is not designed to support generating new debug records.

Converting `(u)int128_t` arguments to `i128` in the IR seems like the
simpler solution, if it doesn't cause any ABI issues.
---
 clang/lib/CodeGen/Targets/X86.cpp         | 32 ++++++++++++-----
 clang/test/CodeGen/X86/i128-debuginfo.c   | 10 ++++++
 clang/test/CodeGen/X86/x86_64-arguments.c | 39 +++++++++++++++++++++
 clang/test/CodeGen/alloc-align-attr.c     | 58 +++++++++----------------------
 clang/test/CodeGen/builtins.c             | 18 ++--------
 clang/test/CodeGen/ext-int-cc.c           |  4 +--
 clang/test/CodeGen/extend-arg-64.c        |  2 +-
 7 files changed, 96 insertions(+), 67 deletions(-)
 create mode 100644 clang/test/CodeGen/X86/i128-debuginfo.c

diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp
index 0b712ac2dabc..abb91486e7ee 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -2470,13 +2470,12 @@ GetSSETypeAtOffset(llvm::Type *IRType, unsigned IROffset,
   return llvm::Type::getDoubleTy(getVMContext());
 }
 
-
 /// GetINTEGERTypeAtOffset - The ABI specifies that a value should be passed in
-/// an 8-byte GPR.  This means that we either have a scalar or we are talking
-/// about the high or low part of an up-to-16-byte struct.  This routine picks
-/// the best LLVM IR type to represent this, which may be i64 or may be anything
-/// else that the backend will pass in a GPR that works better (e.g. i8, %foo*,
-/// etc).
+/// one or more 8-byte GPRs.  This means that we either have a scalar or we are
+/// talking about the high and/or low part of an up-to-16-byte struct.  This
+/// routine picks the best LLVM IR type to represent this, which may be i64 or
+/// may be anything else that the backend will pass in GPRs that works better
+/// (e.g. i8, %foo*, etc).
 ///
 /// PrefType is an LLVM IR type that corresponds to (part of) the IR type for
 /// the source type.  IROffset is an offset in bytes into the LLVM IR type that
@@ -2534,6 +2533,13 @@ GetINTEGERTypeAtOffset(llvm::Type *IRType, unsigned IROffset,
                                   SourceOffset);
   }
 
+  // if we have a 128-bit integer, we can pass it safely using an i128
+  // so we return that
+  if (IRType->isIntegerTy(128)) {
+    assert(IROffset == 0);
+    return IRType;
+  }
+
   // Okay, we don't have any better idea of what to pass, so we pass this in an
   // integer register that isn't too big to fit the rest of the struct.
   unsigned TySizeInBytes =
@@ -2591,8 +2597,7 @@ GetX86_64ByValArgumentPair(llvm::Type *Lo, llvm::Type *Hi,
   return Result;
 }
 
-ABIArgInfo X86_64ABIInfo::
-classifyReturnType(QualType RetTy) const {
+ABIArgInfo X86_64ABIInfo::classifyReturnType(QualType RetTy) const {
   // AMD64-ABI 3.2.3p4: Rule 1. Classify the return type with the
   // classification algorithm.
   X86_64ABIInfo::Class Lo, Hi;
@@ -2638,6 +2643,12 @@ classifyReturnType(QualType RetTy) const {
           isPromotableIntegerTypeForABI(RetTy))
         return ABIArgInfo::getExtend(RetTy);
     }
+
+    if (ResType->isIntegerTy(128)) {
+      // i128 are passed directly
+      assert(Hi == Integer);
+      return ABIArgInfo::getDirect(ResType);
+    }
     break;
 
     // AMD64-ABI 3.2.3p4: Rule 4. If the class is SSE, the next
@@ -2783,6 +2794,11 @@ X86_64ABIInfo::classifyArgumentType(QualType Ty, unsigned freeIntRegs,
         return ABIArgInfo::getExtend(Ty, CGT.ConvertType(Ty));
     }
 
+    if (ResType->isIntegerTy(128)) {
+      assert(Hi == Integer);
+      ++neededInt;
+      return ABIArgInfo::getDirect(ResType);
+    }
     break;
 
     // AMD64-ABI 3.2.3p3: Rule 3. If the class is SSE, the next
diff --git a/clang/test/CodeGen/X86/i128-debuginfo.c b/clang/test/CodeGen/X86/i128-debuginfo.c
new file mode 100644
index 000000000000..4b865c1bed9f
--- /dev/null
+++ b/clang/test/CodeGen/X86/i128-debuginfo.c
@@ -0,0 +1,10 @@
+// no autogeneration since update_cc_test_checks does not support -g
+// RUN: %clang_cc1 -triple x86_64-pc-linux -O1 -debug-info-kind=limited -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-LABEL: define{{.*}} i128 @add(i128 noundef %a)
+// CHECK: #dbg_value(i128 %a, ![[DI:.*]], !DIExpression()
+__int128_t add(__int128_t a) {
+  return a + a;
+}
+
+// CHECK: ![[DI]] = !DILocalVariable(name: "a", arg: 1
diff --git a/clang/test/CodeGen/X86/x86_64-arguments.c b/clang/test/CodeGen/X86/x86_64-arguments.c
index 82845f0a2b31..580f9487395d 100644
--- a/clang/test/CodeGen/X86/x86_64-arguments.c
+++ b/clang/test/CodeGen/X86/x86_64-arguments.c
@@ -551,6 +551,45 @@ struct s68 {
 void f68(struct s68 x) {
 }
 
+// CHECK-LABEL: define{{.*}} i128 @f69(i128 noundef %a)
+__int128_t f69(__int128_t a) {
+  return a;
+}
+
+// CHECK-LABEL: define{{.*}} i128 @f70(i128 noundef %a)
+__uint128_t f70(__uint128_t a) {
+  return a;
+}
+
+// check that registers are correctly counted for (u)int128_t arguments
+struct s71 {
+  long long a, b;
+};
+// CHECK-LABEL: define{{.*}} void @f71(i128 noundef %a, i128 noundef %b, i64 noundef %c, ptr noundef byval(%struct.s71) align 8 %d)
+void f71(__int128_t a, __int128_t b, long long c, struct s71 d) {
+}
+// CHECK-LABEL: define{{.*}} void @f72(i128 noundef %a, i128 noundef %b, i64 %d.coerce0, i64 %d.coerce1)
+void f72(__int128_t a, __int128_t b, struct s71 d) {
+}
+
+// check that structs containing (u)int128_t are passed correctly
+struct s73 {
+  struct inner {
+    __uint128_t a;
+  };
+  struct inner in;
+};
+// CHECK-LABEL: define{{.*}} i128 @f73(i128 %a.coerce)
+struct s73 f73(struct s73 a) {
+  return a;
+}
+
+// check that _BitInt(128) is still passed correctly on the stack
+// CHECK-LABEL: define{{.*}} i128 @f74(i128 noundef %b, i128 noundef %c, i128 noundef %d, i64 noundef %e, ptr noundef byval(i128) align 8 %0)
+_BitInt(128) f74(__uint128_t b, __uint128_t c, __uint128_t d, long e, _BitInt(128) a) {
+  return a;
+}
+
 /// The synthesized __va_list_tag does not have file/line fields.
 // CHECK:      = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "__va_list_tag",
 // CHECK-NOT:  file:
diff --git a/clang/test/CodeGen/alloc-align-attr.c b/clang/test/CodeGen/alloc-align-attr.c
index 76e5d1041b19..c4c4e76eaaa0 100644
--- a/clang/test/CodeGen/alloc-align-attr.c
+++ b/clang/test/CodeGen/alloc-align-attr.c
@@ -70,66 +70,42 @@ __INT32_TYPE__ test4(__SIZE_TYPE__ a) {
 
 struct Empty {};
 struct MultiArgs { __INT64_TYPE__ a, b;};
-// Struct parameter doesn't take up an IR parameter, 'i' takes up 2.
+// Struct parameter doesn't take up an IR parameter, 'i' takes up 1.
 // Truncation to i64 is permissible, since alignments of greater than 2^64 are insane.
 __INT32_TYPE__ *m3(struct Empty s, __int128_t i) __attribute__((alloc_align(2)));
 // CHECK-LABEL: @test5(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[A:%.*]] = alloca i128, align 16
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i128, align 16
 // CHECK-NEXT:    [[E:%.*]] = alloca [[STRUCT_EMPTY:%.*]], align 1
-// CHECK-NEXT:    [[COERCE:%.*]] = alloca i128, align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 0
-// CHECK-NEXT:    store i64 [[A_COERCE0:%.*]], ptr [[TMP0]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 1
-// CHECK-NEXT:    store i64 [[A_COERCE1:%.*]], ptr [[TMP1]], align 8
-// CHECK-NEXT:    [[A1:%.*]] = load i128, ptr [[A]], align 16
-// CHECK-NEXT:    store i128 [[A1]], ptr [[A_ADDR]], align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = load i128, ptr [[A_ADDR]], align 16
-// CHECK-NEXT:    store i128 [[TMP2]], ptr [[COERCE]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 1
-// CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8
-// CHECK-NEXT:    [[CALL:%.*]] = call ptr @m3(i64 noundef [[TMP4]], i64 noundef [[TMP6]])
-// CHECK-NEXT:    [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP2]] to i64
+// CHECK-NEXT:    store i128 [[A:%.*]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[CALL:%.*]] = call ptr @m3(i128 noundef [[TMP0]])
+// CHECK-NEXT:    [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP0]] to i64
 // CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[CALL]], i64 [[CASTED_ALIGN]]) ]
-// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[CALL]], align 4
-// CHECK-NEXT:    ret i32 [[TMP7]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[CALL]], align 4
+// CHECK-NEXT:    ret i32 [[TMP1]]
 //
 __INT32_TYPE__ test5(__int128_t a) {
   struct Empty e;
   return *m3(e, a);
 }
-// Struct parameter takes up 2 parameters, 'i' takes up 2.
+// Struct parameter takes up 2 parameters, 'i' takes up 1.
 __INT32_TYPE__ *m4(struct MultiArgs s, __int128_t i) __attribute__((alloc_align(2)));
 // CHECK-LABEL: @test6(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[A:%.*]] = alloca i128, align 16
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i128, align 16
 // CHECK-NEXT:    [[E:%.*]] = alloca [[STRUCT_MULTIARGS:%.*]], align 8
-// CHECK-NEXT:    [[COERCE:%.*]] = alloca i128, align 16
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 0
-// CHECK-NEXT:    store i64 [[A_COERCE0:%.*]], ptr [[TMP0]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 1
-// CHECK-NEXT:    store i64 [[A_COERCE1:%.*]], ptr [[TMP1]], align 8
-// CHECK-NEXT:    [[A1:%.*]] = load i128, ptr [[A]], align 16
-// CHECK-NEXT:    store i128 [[A1]], ptr [[A_ADDR]], align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = load i128, ptr [[A_ADDR]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 0
+// CHECK-NEXT:    store i128 [[A:%.*]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 1
 // CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 1
-// CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8
-// CHECK-NEXT:    store i128 [[TMP2]], ptr [[COERCE]], align 16
-// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 0
-// CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP7]], align 16
-// CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 1
-// CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP9]], align 8
-// CHECK-NEXT:    [[CALL:%.*]] = call ptr @m4(i64 [[TMP4]], i64 [[TMP6]], i64 noundef [[TMP8]], i64 noundef [[TMP10]])
-// CHECK-NEXT:    [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP2]] to i64
+// CHECK-NEXT:    [[CALL:%.*]] = call ptr @m4(i64 [[TMP2]], i64 [[TMP4]], i128 noundef [[TMP0]])
+// CHECK-NEXT:    [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP0]] to i64
 // CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[CALL]], i64 [[CASTED_ALIGN]]) ]
-// CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[CALL]], align 4
-// CHECK-NEXT:    ret i32 [[TMP11]]
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[CALL]], align 4
+// CHECK-NEXT:    ret i32 [[TMP5]]
 //
 __INT32_TYPE__ test6(__int128_t a) {
   struct MultiArgs e;
diff --git a/clang/test/CodeGen/builtins.c b/clang/test/CodeGen/builtins.c
index eda6c67fdad0..aa9965b81598 100644
--- a/clang/test/CodeGen/builtins.c
+++ b/clang/test/CodeGen/builtins.c
@@ -956,36 +956,24 @@ void test_builtin_os_log_errno(void) {
 void test_builtin_os_log_long_double(void *buf, long double ld) {
   // CHECK: %[[BUF_ADDR:.*]] = alloca ptr, align 8
   // CHECK: %[[LD_ADDR:.*]] = alloca x86_fp80, align 16
-  // CHECK: %[[COERCE:.*]] = alloca i128, align 16
   // CHECK: store ptr %[[BUF]], ptr %[[BUF_ADDR]], align 8
   // CHECK: store x86_fp80 %[[LD]], ptr %[[LD_ADDR]], align 16
   // CHECK: %[[V0:.*]] = load ptr, ptr %[[BUF_ADDR]], align 8
   // CHECK: %[[V1:.*]] = load x86_fp80, ptr %[[LD_ADDR]], align 16
   // CHECK: %[[V2:.*]] = bitcast x86_fp80 %[[V1]] to i80
   // CHECK: %[[V3:.*]] = zext i80 %[[V2]] to i128
-  // CHECK: store i128 %[[V3]], ptr %[[COERCE]], align 16
-  // CHECK: %[[V5:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[COERCE]], i32 0, i32 0
-  // CHECK: %[[V6:.*]] = load i64, ptr %[[V5]], align 16
-  // CHECK: %[[V7:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[COERCE]], i32 0, i32 1
-  // CHECK: %[[V8:.*]] = load i64, ptr %[[V7]], align 8
-  // CHECK: call void @__os_log_helper_1_0_1_16_0(ptr noundef %[[V0]], i64 noundef %[[V6]], i64 noundef %[[V8]])
+  // CHECK: call void @__os_log_helper_1_0_1_16_0(ptr noundef %[[V0]], i128 noundef %[[V3]])
 
   __builtin_os_log_format(buf, "%Lf", ld);
 }
 
 // CHECK-LABEL: define linkonce_odr hidden void @__os_log_helper_1_0_1_16_0
-// CHECK: (ptr noundef %[[BUFFER:.*]], i64 noundef %[[ARG0_COERCE0:.*]], i64 noundef %[[ARG0_COERCE1:.*]])
+// CHECK: (ptr noundef %[[BUFFER:.*]], i128 noundef %[[ARG0:.*]])
 
-// CHECK: %[[ARG0:.*]] = alloca i128, align 16
 // CHECK: %[[BUFFER_ADDR:.*]] = alloca ptr, align 8
 // CHECK: %[[ARG0_ADDR:.*]] = alloca i128, align 16
-// CHECK: %[[V1:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[ARG0]], i32 0, i32 0
-// CHECK: store i64 %[[ARG0_COERCE0]], ptr %[[V1]], align 16
-// CHECK: %[[V2:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[ARG0]], i32 0, i32 1
-// CHECK: store i64 %[[ARG0_COERCE1]], ptr %[[V2]], align 8
-// CHECK: %[[ARG01:.*]] = load i128, ptr %[[ARG0]], align 16
 // CHECK: store ptr %[[BUFFER]], ptr %[[BUFFER_ADDR]], align 8
-// CHECK: store i128 %[[ARG01]], ptr %[[ARG0_ADDR]], align 16
+// CHECK: store i128 %[[ARG0]], ptr %[[ARG0_ADDR]], align 16
 // CHECK: %[[BUF:.*]] = load ptr, ptr %[[BUFFER_ADDR]], align 8
 // CHECK: %[[SUMMARY:.*]] = getelementptr i8, ptr %[[BUF]], i64 0
 // CHECK: store i8 0, ptr %[[SUMMARY]], align 1
diff --git a/clang/test/CodeGen/ext-int-cc.c b/clang/test/CodeGen/ext-int-cc.c
index f31a4eb240c2..fdca4012ee4a 100644
--- a/clang/test/CodeGen/ext-int-cc.c
+++ b/clang/test/CodeGen/ext-int-cc.c
@@ -32,7 +32,7 @@
 
 // Make sure 128 and 64 bit versions are passed like integers.
 void ParamPassing(_BitInt(128) b, _BitInt(64) c) {}
-// LIN64: define{{.*}} void @ParamPassing(i64 %{{.+}}, i64 %{{.+}}, i64 %{{.+}})
+// LIN64: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}})
 // WIN64: define dso_local void @ParamPassing(ptr %{{.+}}, i64 %{{.+}})
 // LIN32: define{{.*}} void @ParamPassing(ptr %{{.+}}, i64 %{{.+}})
 // WIN32: define dso_local void @ParamPassing(ptr %{{.+}}, i64 %{{.+}})
@@ -251,7 +251,7 @@ _BitInt(127) ReturnPassing3(void) { return 0; }
 // LA32: define{{.*}} void @ReturnPassing3(ptr dead_on_unwind noalias writable sret
 
 _BitInt(128) ReturnPassing4(void) { return 0; }
-// LIN64: define{{.*}} { i64, i64 } @ReturnPassing4(
+// LIN64: define{{.*}} i128 @ReturnPassing4(
 // WIN64: define dso_local void @ReturnPassing4(ptr dead_on_unwind noalias writable sret
 // LIN32: define{{.*}} void @ReturnPassing4(ptr dead_on_unwind noalias writable sret
 // WIN32: define dso_local void @ReturnPassing4(ptr dead_on_unwind noalias writable sret
diff --git a/clang/test/CodeGen/extend-arg-64.c b/clang/test/CodeGen/extend-arg-64.c
index 2cb56d35af21..8b99c01807ec 100644
--- a/clang/test/CodeGen/extend-arg-64.c
+++ b/clang/test/CodeGen/extend-arg-64.c
@@ -84,7 +84,7 @@ int test(void) {
 #ifdef D128
   knr(i128);
   // CHECKEXT: load i128
-  // CHECKEXT: call{{.*}} void (i64, i64, ...) @knr
+  // CHECKEXT: call{{.*}} void (i128, ...) @knr
 #endif
 
   knr(u32, s32, u16, s16, u8, s8);
-- 
cgit v1.2.3


From ff5784bb9094f6035851dc7abc4a5760fdc21e45 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Thu, 17 Jul 2025 12:11:12 -0500
Subject: [flang][OpenMP] Move extractOmpDirective to Utils.cpp, NFC (#148653)

---
 flang/lib/Lower/OpenMP/OpenMP.cpp | 84 ---------------------------------------
 flang/lib/Lower/OpenMP/Utils.cpp  | 84 +++++++++++++++++++++++++++++++++++++++
 flang/lib/Lower/OpenMP/Utils.h    |  3 ++
 3 files changed, 87 insertions(+), 84 deletions(-)

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 4458f62eea95..fcb20fdf187f 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -372,90 +372,6 @@ extractMappedBaseValues(llvm::ArrayRef<mlir::Value> vars,
   });
 }
 
-/// Get the directive enumeration value corresponding to the given OpenMP
-/// construct PFT node.
-llvm::omp::Directive
-extractOmpDirective(const parser::OpenMPConstruct &ompConstruct) {
-  return common::visit(
-      common::visitors{
-          [](const parser::OpenMPAllocatorsConstruct &c) {
-            return llvm::omp::OMPD_allocators;
-          },
-          [](const parser::OpenMPAssumeConstruct &c) {
-            return llvm::omp::OMPD_assume;
-          },
-          [](const parser::OpenMPAtomicConstruct &c) {
-            return llvm::omp::OMPD_atomic;
-          },
-          [](const parser::OpenMPBlockConstruct &c) {
-            return std::get<parser::OmpBlockDirective>(
-                       std::get<parser::OmpBeginBlockDirective>(c.t).t)
-                .v;
-          },
-          [](const parser::OpenMPCriticalConstruct &c) {
-            return llvm::omp::OMPD_critical;
-          },
-          [](const parser::OpenMPDeclarativeAllocate &c) {
-            return llvm::omp::OMPD_allocate;
-          },
-          [](const parser::OpenMPDispatchConstruct &c) {
-            return llvm::omp::OMPD_dispatch;
-          },
-          [](const parser::OpenMPExecutableAllocate &c) {
-            return llvm::omp::OMPD_allocate;
-          },
-          [](const parser::OpenMPLoopConstruct &c) {
-            return std::get<parser::OmpLoopDirective>(
-                       std::get<parser::OmpBeginLoopDirective>(c.t).t)
-                .v;
-          },
-          [](const parser::OpenMPSectionConstruct &c) {
-            return llvm::omp::OMPD_section;
-          },
-          [](const parser::OpenMPSectionsConstruct &c) {
-            return std::get<parser::OmpSectionsDirective>(
-                       std::get<parser::OmpBeginSectionsDirective>(c.t).t)
-                .v;
-          },
-          [](const parser::OpenMPStandaloneConstruct &c) {
-            return common::visit(
-                common::visitors{
-                    [](const parser::OpenMPSimpleStandaloneConstruct &c) {
-                      return c.v.DirId();
-                    },
-                    [](const parser::OpenMPFlushConstruct &c) {
-                      return llvm::omp::OMPD_flush;
-                    },
-                    [](const parser::OpenMPCancelConstruct &c) {
-                      return llvm::omp::OMPD_cancel;
-                    },
-                    [](const parser::OpenMPCancellationPointConstruct &c) {
-                      return llvm::omp::OMPD_cancellation_point;
-                    },
-                    [](const parser::OmpMetadirectiveDirective &c) {
-                      return llvm::omp::OMPD_metadirective;
-                    },
-                    [](const parser::OpenMPDepobjConstruct &c) {
-                      return llvm::omp::OMPD_depobj;
-                    },
-                    [](const parser::OpenMPInteropConstruct &c) {
-                      return llvm::omp::OMPD_interop;
-                    }},
-                c.u);
-          },
-          [](const parser::OpenMPUtilityConstruct &c) {
-            return common::visit(
-                common::visitors{[](const parser::OmpErrorDirective &c) {
-                                   return llvm::omp::OMPD_error;
-                                 },
-                                 [](const parser::OmpNothingDirective &c) {
-                                   return llvm::omp::OMPD_nothing;
-                                 }},
-                c.u);
-          }},
-      ompConstruct.u);
-}
-
 /// Populate the global \see hostEvalInfo after processing clauses for the given
 /// \p eval OpenMP target construct, or nested constructs, if these must be
 /// evaluated outside of the target region per the spec.
diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp
index 2e53f01f1da6..b194150c0f7f 100644
--- a/flang/lib/Lower/OpenMP/Utils.cpp
+++ b/flang/lib/Lower/OpenMP/Utils.cpp
@@ -661,6 +661,90 @@ bool collectLoopRelatedInfo(
 
   return found;
 }
+
+/// Get the directive enumeration value corresponding to the given OpenMP
+/// construct PFT node.
+llvm::omp::Directive
+extractOmpDirective(const parser::OpenMPConstruct &ompConstruct) {
+  return common::visit(
+      common::visitors{
+          [](const parser::OpenMPAllocatorsConstruct &c) {
+            return llvm::omp::OMPD_allocators;
+          },
+          [](const parser::OpenMPAssumeConstruct &c) {
+            return llvm::omp::OMPD_assume;
+          },
+          [](const parser::OpenMPAtomicConstruct &c) {
+            return llvm::omp::OMPD_atomic;
+          },
+          [](const parser::OpenMPBlockConstruct &c) {
+            return std::get<parser::OmpBlockDirective>(
+                       std::get<parser::OmpBeginBlockDirective>(c.t).t)
+                .v;
+          },
+          [](const parser::OpenMPCriticalConstruct &c) {
+            return llvm::omp::OMPD_critical;
+          },
+          [](const parser::OpenMPDeclarativeAllocate &c) {
+            return llvm::omp::OMPD_allocate;
+          },
+          [](const parser::OpenMPDispatchConstruct &c) {
+            return llvm::omp::OMPD_dispatch;
+          },
+          [](const parser::OpenMPExecutableAllocate &c) {
+            return llvm::omp::OMPD_allocate;
+          },
+          [](const parser::OpenMPLoopConstruct &c) {
+            return std::get<parser::OmpLoopDirective>(
+                       std::get<parser::OmpBeginLoopDirective>(c.t).t)
+                .v;
+          },
+          [](const parser::OpenMPSectionConstruct &c) {
+            return llvm::omp::OMPD_section;
+          },
+          [](const parser::OpenMPSectionsConstruct &c) {
+            return std::get<parser::OmpSectionsDirective>(
+                       std::get<parser::OmpBeginSectionsDirective>(c.t).t)
+                .v;
+          },
+          [](const parser::OpenMPStandaloneConstruct &c) {
+            return common::visit(
+                common::visitors{
+                    [](const parser::OpenMPSimpleStandaloneConstruct &c) {
+                      return c.v.DirId();
+                    },
+                    [](const parser::OpenMPFlushConstruct &c) {
+                      return llvm::omp::OMPD_flush;
+                    },
+                    [](const parser::OpenMPCancelConstruct &c) {
+                      return llvm::omp::OMPD_cancel;
+                    },
+                    [](const parser::OpenMPCancellationPointConstruct &c) {
+                      return llvm::omp::OMPD_cancellation_point;
+                    },
+                    [](const parser::OmpMetadirectiveDirective &c) {
+                      return llvm::omp::OMPD_metadirective;
+                    },
+                    [](const parser::OpenMPDepobjConstruct &c) {
+                      return llvm::omp::OMPD_depobj;
+                    },
+                    [](const parser::OpenMPInteropConstruct &c) {
+                      return llvm::omp::OMPD_interop;
+                    }},
+                c.u);
+          },
+          [](const parser::OpenMPUtilityConstruct &c) {
+            return common::visit(
+                common::visitors{[](const parser::OmpErrorDirective &c) {
+                                   return llvm::omp::OMPD_error;
+                                 },
+                                 [](const parser::OmpNothingDirective &c) {
+                                   return llvm::omp::OMPD_nothing;
+                                 }},
+                c.u);
+          }},
+      ompConstruct.u);
+}
 } // namespace omp
 } // namespace lower
 } // namespace Fortran
diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h
index 1526bd4e9023..8e3ad5c3452e 100644
--- a/flang/lib/Lower/OpenMP/Utils.h
+++ b/flang/lib/Lower/OpenMP/Utils.h
@@ -166,6 +166,9 @@ bool collectLoopRelatedInfo(
     lower::pft::Evaluation &eval, const omp::List<omp::Clause> &clauses,
     mlir::omp::LoopRelatedClauseOps &result,
     llvm::SmallVectorImpl<const semantics::Symbol *> &iv);
+
+llvm::omp::Directive
+extractOmpDirective(const parser::OpenMPConstruct &ompConstruct);
 } // namespace omp
 } // namespace lower
 } // namespace Fortran
-- 
cgit v1.2.3