summary | refs | log | tree | commit | diff
path: root/llvm/test/CodeGen/Thumb2
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/Thumb2')
-rw-r--r-- llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll 34
-rw-r--r-- llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll 91
-rw-r--r-- llvm/test/CodeGen/Thumb2/mve-gather-increment.ll 24
-rw-r--r-- llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll 90
-rw-r--r-- llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll 89
-rw-r--r-- llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll 52
6 files changed, 194 insertions, 186 deletions
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
index 98e082be4cad..1769c5d2fd38 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
@@ -21,12 +21,11 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; ENABLED-NEXT: it lt
; ENABLED-NEXT: bxlt lr
; ENABLED-NEXT: .LBB0_1: @ %for.body.lr.ph
-; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr}
; ENABLED-NEXT: mov r11, r0
-; ENABLED-NEXT: ldr r0, [sp, #36]
+; ENABLED-NEXT: ldr r0, [sp, #32]
; ENABLED-NEXT: add.w r9, r2, #3
; ENABLED-NEXT: mov.w r12, #0
-; ENABLED-NEXT: mov.w r8, #1
; ENABLED-NEXT: mov r10, r11
; ENABLED-NEXT: uxth r0, r0
; ENABLED-NEXT: rsbs r5, r0, #0
@@ -50,16 +49,18 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; ENABLED-NEXT: @ %bb.5: @ %vector.ph
; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1
; ENABLED-NEXT: bic r0, r9, #3
-; ENABLED-NEXT: sub.w r4, r2, r12
+; ENABLED-NEXT: movs r7, #1
; ENABLED-NEXT: subs r0, #4
+; ENABLED-NEXT: sub.w r4, r2, r12
; ENABLED-NEXT: vmov.i32 q1, #0x0
-; ENABLED-NEXT: mov r7, r10
-; ENABLED-NEXT: add.w r6, r8, r0, lsr #2
+; ENABLED-NEXT: add.w r6, r7, r0, lsr #2
; ENABLED-NEXT: adds r0, r2, #3
; ENABLED-NEXT: sub.w r0, r0, r12
; ENABLED-NEXT: bic r0, r0, #3
; ENABLED-NEXT: subs r0, #4
-; ENABLED-NEXT: add.w lr, r8, r0, lsr #2
+; ENABLED-NEXT: add.w r0, r7, r0, lsr #2
+; ENABLED-NEXT: mov r7, r10
+; ENABLED-NEXT: dls lr, r0
; ENABLED-NEXT: mov r0, r11
; ENABLED-NEXT: .LBB0_6: @ %vector.body
; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1
@@ -82,7 +83,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; ENABLED-NEXT: vaddv.u32 r0, q0
; ENABLED-NEXT: b .LBB0_3
; ENABLED-NEXT: .LBB0_8:
-; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr}
; ENABLED-NEXT: bx lr
;
; NOREDUCTIONS-LABEL: varying_outer_2d_reduction:
@@ -91,12 +92,11 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; NOREDUCTIONS-NEXT: it lt
; NOREDUCTIONS-NEXT: bxlt lr
; NOREDUCTIONS-NEXT: .LBB0_1: @ %for.body.lr.ph
-; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr}
; NOREDUCTIONS-NEXT: mov r11, r0
-; NOREDUCTIONS-NEXT: ldr r0, [sp, #36]
+; NOREDUCTIONS-NEXT: ldr r0, [sp, #32]
; NOREDUCTIONS-NEXT: add.w r9, r2, #3
; NOREDUCTIONS-NEXT: mov.w r12, #0
-; NOREDUCTIONS-NEXT: mov.w r8, #1
; NOREDUCTIONS-NEXT: mov r10, r11
; NOREDUCTIONS-NEXT: uxth r0, r0
; NOREDUCTIONS-NEXT: rsbs r5, r0, #0
@@ -120,16 +120,18 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph
; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1
; NOREDUCTIONS-NEXT: bic r0, r9, #3
-; NOREDUCTIONS-NEXT: sub.w r4, r2, r12
+; NOREDUCTIONS-NEXT: movs r7, #1
; NOREDUCTIONS-NEXT: subs r0, #4
+; NOREDUCTIONS-NEXT: sub.w r4, r2, r12
; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0
-; NOREDUCTIONS-NEXT: mov r7, r10
-; NOREDUCTIONS-NEXT: add.w r6, r8, r0, lsr #2
+; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2
; NOREDUCTIONS-NEXT: adds r0, r2, #3
; NOREDUCTIONS-NEXT: sub.w r0, r0, r12
; NOREDUCTIONS-NEXT: bic r0, r0, #3
; NOREDUCTIONS-NEXT: subs r0, #4
-; NOREDUCTIONS-NEXT: add.w lr, r8, r0, lsr #2
+; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2
+; NOREDUCTIONS-NEXT: mov r7, r10
+; NOREDUCTIONS-NEXT: dls lr, r0
; NOREDUCTIONS-NEXT: mov r0, r11
; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body
; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1
@@ -152,7 +154,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0
; NOREDUCTIONS-NEXT: b .LBB0_3
; NOREDUCTIONS-NEXT: .LBB0_8:
-; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr}
; NOREDUCTIONS-NEXT: bx lr
entry:
%conv = sext i16 %N to i32
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
index 435acc29f076..cbcbf1f392ce 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
@@ -165,73 +165,74 @@ define dso_local i32 @b(ptr %c, i32 %d, i32 %e, ptr %n) "frame-pointer"="all" {
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: wls lr, r1, .LBB2_3
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
-; CHECK-NEXT: add.w r9, r3, #4
-; CHECK-NEXT: add.w r10, r0, #4
+; CHECK-NEXT: adds r6, r3, #4
+; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: mvn r8, #1
-; CHECK-NEXT: @ implicit-def: $r6
+; CHECK-NEXT: @ implicit-def: $r9
; CHECK-NEXT: @ implicit-def: $r4
; CHECK-NEXT: str r2, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB2_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr.w r1, [r10]
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: asrs r2, r4, #31
-; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [r1]
; CHECK-NEXT: muls r1, r3, r1
; CHECK-NEXT: adds r4, r4, r1
; CHECK-NEXT: adc.w r1, r2, r1, asr #31
; CHECK-NEXT: adds.w r2, r4, #-2147483648
-; CHECK-NEXT: ldrd r5, r4, [r8]
-; CHECK-NEXT: adc r2, r1, #0
+; CHECK-NEXT: ldrd r2, r4, [r8]
+; CHECK-NEXT: adc r5, r1, #0
+; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: smull r4, r2, r4, r9
+; CHECK-NEXT: asrs r1, r5, #31
; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: smull r4, r5, r4, r6
-; CHECK-NEXT: asrs r1, r2, #31
-; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: subs r4, r2, r4
-; CHECK-NEXT: sbcs r1, r5
-; CHECK-NEXT: adds.w r6, r4, #-2147483648
-; CHECK-NEXT: ldr r4, [r10, #-4]
-; CHECK-NEXT: adc r11, r1, #0
-; CHECK-NEXT: mov r1, r9
-; CHECK-NEXT: add.w r10, r10, #4
+; CHECK-NEXT: subs r4, r5, r4
+; CHECK-NEXT: sbcs r1, r2
+; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: adds.w r10, r4, #-2147483648
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: ldr r4, [r2, #-4]
; CHECK-NEXT: muls r4, r3, r4
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: adds.w r12, r4, #-2147483648
; CHECK-NEXT: asr.w r5, r4, #31
-; CHECK-NEXT: ldr.w r4, [r9]
+; CHECK-NEXT: ldr r4, [r6]
; CHECK-NEXT: adc r5, r5, #0
; CHECK-NEXT: mul r2, r4, r0
+; CHECK-NEXT: adds r0, #4
; CHECK-NEXT: add.w r2, r2, #-2147483648
; CHECK-NEXT: asrl r12, r5, r2
-; CHECK-NEXT: smull r2, r9, r4, r12
-; CHECK-NEXT: mov r12, r0
-; CHECK-NEXT: lsll r2, r9, #30
-; CHECK-NEXT: asr.w r5, r9, #31
-; CHECK-NEXT: mov r2, r9
-; CHECK-NEXT: mov r9, r1
-; CHECK-NEXT: ldrd r1, r0, [sp, #4] @ 8-byte Folded Reload
-; CHECK-NEXT: lsll r2, r5, r4
-; CHECK-NEXT: lsrl r2, r5, #2
-; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: adds r0, #2
-; CHECK-NEXT: lsll r2, r5, r0
-; CHECK-NEXT: add.w r0, r2, #-2147483648
+; CHECK-NEXT: smull r2, r5, r4, r12
+; CHECK-NEXT: lsll r2, r5, #30
+; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: asr.w r11, r5, #31
+; CHECK-NEXT: mov r12, r5
+; CHECK-NEXT: lsll r12, r11, r4
+; CHECK-NEXT: mul r2, r2, r9
+; CHECK-NEXT: lsrl r12, r11, #2
+; CHECK-NEXT: adds r2, #2
+; CHECK-NEXT: lsll r12, r11, r2
; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
-; CHECK-NEXT: asrl r6, r11, r0
-; CHECK-NEXT: movs r0, #2
-; CHECK-NEXT: lsrl r6, r11, #2
-; CHECK-NEXT: str r6, [r0]
-; CHECK-NEXT: ldr r0, [r8], #-4
-; CHECK-NEXT: mls r0, r0, r4, r1
-; CHECK-NEXT: adds.w r4, r0, #-2147483648
-; CHECK-NEXT: asr.w r1, r0, #31
+; CHECK-NEXT: add.w r5, r12, #-2147483648
+; CHECK-NEXT: asrl r10, r1, r5
+; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: lsrl r10, r1, #2
+; CHECK-NEXT: movs r1, #2
+; CHECK-NEXT: mov r9, r10
+; CHECK-NEXT: str.w r10, [r1]
+; CHECK-NEXT: ldr r1, [r8], #-4
+; CHECK-NEXT: mls r5, r1, r4, r5
+; CHECK-NEXT: adds.w r4, r5, #-2147483648
+; CHECK-NEXT: asr.w r1, r5, #31
; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: lsrl r4, r1, #2
-; CHECK-NEXT: rsbs r0, r4, #0
-; CHECK-NEXT: str r0, [r2]
-; CHECK-NEXT: str r0, [r9, #-4]
-; CHECK-NEXT: add.w r9, r9, #4
-; CHECK-NEXT: add.w r0, r12, #4
+; CHECK-NEXT: rsbs r1, r4, #0
+; CHECK-NEXT: str r1, [r2]
+; CHECK-NEXT: str r1, [r6, #-4]
+; CHECK-NEXT: adds r6, #4
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: adds r1, #4
; CHECK-NEXT: le lr, .LBB2_2
; CHECK-NEXT: .LBB2_3: @ %while.end
; CHECK-NEXT: add sp, #16
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index b60ee7c6d406..0d86f22a321e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -1313,29 +1313,27 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
; CHECK-NEXT: @ Child Loop BB16_3 Depth 2
; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload
; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: .LBB16_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB16_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vmov q0, q6
-; CHECK-NEXT: vadd.i32 q6, q5, r0
-; CHECK-NEXT: vmov r7, r3, d13
+; CHECK-NEXT: vadd.i32 q1, q5, r0
; CHECK-NEXT: vadd.i32 q2, q4, r0
-; CHECK-NEXT: vmov r5, r6, d5
-; CHECK-NEXT: vmov q1, q7
-; CHECK-NEXT: vmov r4, r10, d12
+; CHECK-NEXT: vmov r7, r3, d3
; CHECK-NEXT: vadd.i32 q6, q0, lr
+; CHECK-NEXT: vmov r5, r6, d5
; CHECK-NEXT: subs.w r9, r9, #16
+; CHECK-NEXT: vmov r4, r10, d2
+; CHECK-NEXT: vadd.i32 q1, q7, lr
; CHECK-NEXT: vadd.i32 q4, q4, lr
; CHECK-NEXT: vadd.i32 q5, q5, lr
-; CHECK-NEXT: vadd.i32 q7, q7, lr
; CHECK-NEXT: ldrb.w r11, [r3]
; CHECK-NEXT: ldrb r3, [r7]
; CHECK-NEXT: vmov r7, r12, d4
-; CHECK-NEXT: vadd.i32 q2, q1, r0
-; CHECK-NEXT: vadd.i32 q1, q0, r0
+; CHECK-NEXT: vadd.i32 q2, q7, r0
+; CHECK-NEXT: vadd.i32 q7, q0, r0
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: ldrb r6, [r6]
; CHECK-NEXT: ldrb r4, [r4]
@@ -1344,7 +1342,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
; CHECK-NEXT: ldrb.w r1, [r12]
; CHECK-NEXT: vmov.8 q0[0], r7
; CHECK-NEXT: vmov.8 q0[1], r1
-; CHECK-NEXT: vmov r1, r7, d3
+; CHECK-NEXT: vmov r1, r7, d15
; CHECK-NEXT: vmov.8 q0[2], r5
; CHECK-NEXT: vmov.8 q0[3], r6
; CHECK-NEXT: vmov.8 q0[4], r4
@@ -1359,7 +1357,8 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
; CHECK-NEXT: ldrb r3, [r5]
; CHECK-NEXT: ldrb.w r12, [r7]
; CHECK-NEXT: ldrb r5, [r4]
-; CHECK-NEXT: vmov r4, r7, d2
+; CHECK-NEXT: vmov r4, r7, d14
+; CHECK-NEXT: vmov q7, q1
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r7, [r7]
; CHECK-NEXT: vmov.8 q0[8], r4
@@ -1371,6 +1370,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
; CHECK-NEXT: vmov.8 q0[14], r3
; CHECK-NEXT: vmov.8 q0[15], r12
; CHECK-NEXT: vstrb.8 q0, [r8], #16
+; CHECK-NEXT: vmov q0, q6
; CHECK-NEXT: bne .LBB16_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB16_2 Depth=1
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index c0b2da7eff41..eedca2cd4a5d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -236,11 +236,11 @@ define arm_aapcs_vfpcc void @push_out_mul_gather_scatter(ptr noalias nocapture r
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: .LBB5_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q3, [r0, q1, uxtw #2]
+; CHECK-NEXT: vldrw.u32 q2, [r0, q1, uxtw #2]
+; CHECK-NEXT: vadd.i32 q3, q1, q0
; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vmov q2, q1
-; CHECK-NEXT: vadd.i32 q1, q1, q0
-; CHECK-NEXT: vstrw.32 q3, [r0, q2, uxtw #2]
+; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2]
+; CHECK-NEXT: vmov q1, q3
; CHECK-NEXT: bne .LBB5_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: bx lr
@@ -330,20 +330,20 @@ define arm_aapcs_vfpcc void @non_gatscat_use1(ptr noalias nocapture readonly %da
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adr r4, .LCPI7_0
; CHECK-NEXT: mov.w r12, #9
-; CHECK-NEXT: vldrw.u32 q0, [r4]
+; CHECK-NEXT: vldrw.u32 q1, [r4]
; CHECK-NEXT: mov.w lr, #12
; CHECK-NEXT: movs r4, #8
-; CHECK-NEXT: vdup.32 q1, r0
+; CHECK-NEXT: vdup.32 q0, r0
; CHECK-NEXT: .LBB7_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vmov q3, q1
-; CHECK-NEXT: vmla.i32 q3, q2, lr
-; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vmov q3, q0
+; CHECK-NEXT: vadd.i32 q2, q1, r4
+; CHECK-NEXT: vmla.i32 q3, q1, lr
+; CHECK-NEXT: vmul.i32 q1, q1, r12
; CHECK-NEXT: vldrw.u32 q4, [q3, #24]
-; CHECK-NEXT: vmul.i32 q2, q2, r12
-; CHECK-NEXT: vadd.i32 q0, q0, r4
-; CHECK-NEXT: vstrw.32 q2, [r3]
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vstrw.32 q1, [r3]
+; CHECK-NEXT: vmov q1, q2
; CHECK-NEXT: vstrb.8 q4, [r1], #16
; CHECK-NEXT: bne .LBB7_1
; CHECK-NEXT: @ %bb.2: @ %end
@@ -390,22 +390,22 @@ define arm_aapcs_vfpcc void @non_gatscat_use2(ptr noalias nocapture readonly %da
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: adr r4, .LCPI8_0
; CHECK-NEXT: movs r5, #18
-; CHECK-NEXT: vldrw.u32 q0, [r4]
+; CHECK-NEXT: vldrw.u32 q2, [r4]
; CHECK-NEXT: mov.w r12, #9
; CHECK-NEXT: mov.w lr, #12
; CHECK-NEXT: movs r4, #8
-; CHECK-NEXT: vdup.32 q1, r0
-; CHECK-NEXT: vdup.32 q2, r5
+; CHECK-NEXT: vdup.32 q0, r0
+; CHECK-NEXT: vdup.32 q1, r5
; CHECK-NEXT: .LBB8_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vmov q4, q1
-; CHECK-NEXT: vmla.i32 q4, q3, lr
+; CHECK-NEXT: vmov q4, q0
+; CHECK-NEXT: vadd.i32 q3, q2, r4
+; CHECK-NEXT: vmla.i32 q4, q2, lr
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vldrw.u32 q5, [q4, #24]
-; CHECK-NEXT: vmov q4, q2
-; CHECK-NEXT: vmla.i32 q4, q3, r12
-; CHECK-NEXT: vadd.i32 q0, q0, r4
+; CHECK-NEXT: vmov q4, q1
+; CHECK-NEXT: vmla.i32 q4, q2, r12
+; CHECK-NEXT: vmov q2, q3
; CHECK-NEXT: vstrb.8 q5, [r1], #16
; CHECK-NEXT: vstrw.32 q4, [r3]
; CHECK-NEXT: bne .LBB8_1
@@ -487,21 +487,21 @@ define dso_local void @arm_mat_mult_q31(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB9_3 Depth 3
; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vmov q1, q2
+; CHECK-NEXT: vmov q7, q2
; CHECK-NEXT: dls lr, r10
; CHECK-NEXT: vmov.i32 q5, #0x0
-; CHECK-NEXT: vmlas.i32 q1, q0, r7
-; CHECK-NEXT: vmov q7, q4
+; CHECK-NEXT: vmlas.i32 q7, q0, r7
+; CHECK-NEXT: vmov q6, q4
; CHECK-NEXT: .LBB9_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vmov q6, q1
-; CHECK-NEXT: vadd.i32 q1, q1, q3
-; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2]
-; CHECK-NEXT: vldrw.u32 q6, [q7, #32]!
-; CHECK-NEXT: vmul.i32 q0, q0, q6
-; CHECK-NEXT: vadd.i32 q5, q0, q5
+; CHECK-NEXT: vadd.i32 q0, q7, q3
+; CHECK-NEXT: vldrw.u32 q1, [r1, q7, uxtw #2]
+; CHECK-NEXT: vldrw.u32 q7, [q6, #32]!
+; CHECK-NEXT: vmul.i32 q1, q1, q7
+; CHECK-NEXT: vmov q7, q0
+; CHECK-NEXT: vadd.i32 q5, q1, q5
; CHECK-NEXT: le lr, .LBB9_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2
@@ -702,12 +702,12 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vmov q6, q5
-; CHECK-NEXT: vadd.i32 q5, q5, q3
-; CHECK-NEXT: vldrh.s32 q7, [r1, q6, uxtw #1]
-; CHECK-NEXT: vldrh.s32 q6, [r3], #8
-; CHECK-NEXT: vmul.i32 q6, q7, q6
-; CHECK-NEXT: vadd.i32 q4, q6, q4
+; CHECK-NEXT: vadd.i32 q6, q5, q3
+; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1]
+; CHECK-NEXT: vldrh.s32 q5, [r3], #8
+; CHECK-NEXT: vmul.i32 q5, q7, q5
+; CHECK-NEXT: vadd.i32 q4, q5, q4
+; CHECK-NEXT: vmov q5, q6
; CHECK-NEXT: le lr, .LBB10_11
; CHECK-NEXT: @ %bb.12: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
@@ -922,15 +922,15 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(ptr nocapture readonly
; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3
; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4
; CHECK-NEXT: @ => This Inner Loop Header: Depth=5
-; CHECK-NEXT: vmov q7, q5
-; CHECK-NEXT: vmov q6, q4
-; CHECK-NEXT: vldrb.s32 q2, [r0, q7]
-; CHECK-NEXT: vldrb.s32 q7, [r1, q6]
-; CHECK-NEXT: subs r5, #4
-; CHECK-NEXT: vadd.i32 q4, q4, q0
+; CHECK-NEXT: vldrb.s32 q2, [r0, q5]
+; CHECK-NEXT: vadd.i32 q7, q5, q0
+; CHECK-NEXT: vldrb.s32 q5, [r1, q4]
+; CHECK-NEXT: vadd.i32 q6, q4, q0
; CHECK-NEXT: vadd.i32 q2, q2, r2
-; CHECK-NEXT: vadd.i32 q5, q5, q0
-; CHECK-NEXT: vmlava.u32 r12, q2, q7
+; CHECK-NEXT: subs r5, #4
+; CHECK-NEXT: vmlava.u32 r12, q2, q5
+; CHECK-NEXT: vmov q5, q7
+; CHECK-NEXT: vmov q4, q6
; CHECK-NEXT: bne .LBB11_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=4
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
index 652d25af02e7..828f8e4f8304 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
@@ -105,66 +105,68 @@ define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: .pad #12
-; CHECK-NEXT: sub sp, #12
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: cmp r3, #1
-; CHECK-NEXT: stm.w sp, {r0, r1, r3} @ 12-byte Folded Spill
+; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
; CHECK-NEXT: blt .LBB4_12
; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph
-; CHECK-NEXT: ldr r1, [sp, #48]
-; CHECK-NEXT: add.w r12, r2, #3
+; CHECK-NEXT: ldr r7, [sp, #44]
+; CHECK-NEXT: add.w r10, r2, #3
; CHECK-NEXT: ldr.w r11, [sp] @ 4-byte Reload
-; CHECK-NEXT: mov.w r10, #0
-; CHECK-NEXT: mov r8, r2
+; CHECK-NEXT: mov r9, r2
+; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: uxth r3, r1
+; CHECK-NEXT: uxth.w r12, r7
+; CHECK-NEXT: mov.w r8, #0
+; CHECK-NEXT: str.w r9, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: b .LBB4_4
; CHECK-NEXT: .LBB4_2: @ in Loop: Header=BB4_4 Depth=1
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: .LBB4_3: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1
-; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: lsrs r2, r6, #16
-; CHECK-NEXT: sub.w r12, r12, #1
+; CHECK-NEXT: lsrs r0, r6, #16
+; CHECK-NEXT: sub.w r10, r10, #1
+; CHECK-NEXT: strh.w r0, [r5, r8, lsl #1]
+; CHECK-NEXT: add.w r8, r8, #1
; CHECK-NEXT: add.w r11, r11, #2
-; CHECK-NEXT: sub.w r8, r8, #1
-; CHECK-NEXT: strh.w r2, [r7, r10, lsl #1]
-; CHECK-NEXT: add.w r10, r10, #1
-; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: cmp r10, r2
-; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: sub.w r9, r9, #1
+; CHECK-NEXT: cmp r8, r3
; CHECK-NEXT: beq .LBB4_12
; CHECK-NEXT: .LBB4_4: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB4_8 Depth 2
; CHECK-NEXT: @ Child Loop BB4_11 Depth 2
-; CHECK-NEXT: cmp r2, r10
+; CHECK-NEXT: cmp r0, r8
; CHECK-NEXT: ble .LBB4_2
; CHECK-NEXT: @ %bb.5: @ %vector.main.loop.iter.check
; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1
-; CHECK-NEXT: sub.w r4, r2, r10
-; CHECK-NEXT: cmp r4, #8
+; CHECK-NEXT: sub.w r0, r0, r8
+; CHECK-NEXT: mov r2, r5
+; CHECK-NEXT: cmp r0, #8
; CHECK-NEXT: bhs .LBB4_7
; CHECK-NEXT: @ %bb.6: @ in Loop: Header=BB4_4 Depth=1
; CHECK-NEXT: movs r6, #0
-; CHECK-NEXT: mov.w r9, #0
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: mov r5, r2
; CHECK-NEXT: b .LBB4_10
; CHECK-NEXT: .LBB4_7: @ %vector.ph
; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1
-; CHECK-NEXT: bic r2, r8, #7
-; CHECK-NEXT: movs r7, #1
-; CHECK-NEXT: subs r2, #8
-; CHECK-NEXT: bic r9, r4, #7
-; CHECK-NEXT: movs r6, #0
+; CHECK-NEXT: bic r7, r9, #7
+; CHECK-NEXT: movs r6, #1
+; CHECK-NEXT: subs r7, #8
+; CHECK-NEXT: bic r1, r0, #7
; CHECK-NEXT: mov r5, r11
-; CHECK-NEXT: add.w lr, r7, r2, lsr #3
-; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
+; CHECK-NEXT: add.w lr, r6, r7, lsr #3
+; CHECK-NEXT: movs r6, #0
+; CHECK-NEXT: ldr r4, [sp] @ 4-byte Reload
; CHECK-NEXT: .LBB4_8: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vldrh.u16 q0, [r2], #16
+; CHECK-NEXT: vldrh.u16 q0, [r4], #16
; CHECK-NEXT: vldrh.u16 q1, [r5], #16
-; CHECK-NEXT: rsbs r7, r3, #0
+; CHECK-NEXT: rsb.w r7, r12, #0
; CHECK-NEXT: vmullb.s16 q2, q1, q0
; CHECK-NEXT: vmullt.s16 q0, q1, q0
; CHECK-NEXT: vshl.s32 q2, r7
@@ -174,29 +176,32 @@ define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef
; CHECK-NEXT: le lr, .LBB4_8
; CHECK-NEXT: @ %bb.9: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1
-; CHECK-NEXT: cmp r4, r9
+; CHECK-NEXT: mov r5, r2
+; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: beq .LBB4_3
; CHECK-NEXT: .LBB4_10: @ %vec.epilog.ph
; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1
-; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT: add.w r2, r9, r10
-; CHECK-NEXT: add.w r7, r1, r9, lsl #1
-; CHECK-NEXT: add.w r2, r1, r2, lsl #1
-; CHECK-NEXT: sub.w r5, r8, r9
-; CHECK-NEXT: dlstp.32 lr, r5
+; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
+; CHECK-NEXT: add.w r4, r1, r8
+; CHECK-NEXT: sub.w r7, r9, r1
+; CHECK-NEXT: add.w r2, r0, r1, lsl #1
+; CHECK-NEXT: add.w r4, r0, r4, lsl #1
+; CHECK-NEXT: mov r1, r5
+; CHECK-NEXT: mov r5, r1
+; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB4_11: @ %vec.epilog.vector.body
; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: rsbs r4, r3, #0
-; CHECK-NEXT: vldrh.s32 q0, [r7], #8
-; CHECK-NEXT: vldrh.s32 q1, [r2], #8
+; CHECK-NEXT: rsb.w r0, r12, #0
+; CHECK-NEXT: vldrh.s32 q0, [r2], #8
+; CHECK-NEXT: vldrh.s32 q1, [r4], #8
; CHECK-NEXT: vmul.i32 q0, q1, q0
-; CHECK-NEXT: vshl.s32 q0, r4
+; CHECK-NEXT: vshl.s32 q0, r0
; CHECK-NEXT: vaddva.u32 r6, q0
; CHECK-NEXT: letp lr, .LBB4_11
; CHECK-NEXT: b .LBB4_3
; CHECK-NEXT: .LBB4_12: @ %for.end17
-; CHECK-NEXT: add sp, #12
+; CHECK-NEXT: add sp, #8
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%conv = sext i16 %Ls to i32
diff --git a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
index d6c5cde30ed7..43ed5eefbf4c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
@@ -18,50 +18,50 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32
; CHECK-NEXT: csel r7, r6, r5, hs
; CHECK-NEXT: add.w lr, r7, #1
; CHECK-NEXT: mov r4, r5
-; CHECK-NEXT: vldrh.u16 q1, [r0], #32
+; CHECK-NEXT: vldrh.u16 q0, [r0], #32
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: mov r8, r5
-; CHECK-NEXT: vldrh.u16 q2, [r1], #32
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q2
-; CHECK-NEXT: vldrh.u16 q0, [r0, #-16]
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q2
-; CHECK-NEXT: vldrh.u16 q2, [r1, #-16]
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q2
; CHECK-NEXT: vldrh.u16 q1, [r1], #32
+; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1
+; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1
+; CHECK-NEXT: vldrh.u16 q3, [r1, #-16]
+; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3
+; CHECK-NEXT: vldrh.u16 q0, [r1], #32
; CHECK-NEXT: sub.w lr, lr, #1
; CHECK-NEXT: cmp.w lr, #0
-; CHECK-NEXT: vldrh.u16 q3, [r0], #32
+; CHECK-NEXT: vldrh.u16 q1, [r0], #32
; CHECK-NEXT: beq .LBB0_3
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB0_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q2
-; CHECK-NEXT: vldrh.u16 q2, [r1, #-16]
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q1
-; CHECK-NEXT: vldrh.u16 q0, [r0, #-16]
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q3, q1
-; CHECK-NEXT: vldrh.u16 q3, [r0], #32
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q2
-; CHECK-NEXT: vldrh.u16 q1, [r1], #32
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3
+; CHECK-NEXT: vldrh.u16 q3, [r1, #-16]
+; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0
+; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0
+; CHECK-NEXT: vldrh.u16 q1, [r0], #32
+; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3
+; CHECK-NEXT: vldrh.u16 q0, [r1], #32
; CHECK-NEXT: le lr, .LBB0_2
; CHECK-NEXT: .LBB0_3:
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q2
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3
; CHECK-NEXT: movs r6, #14
; CHECK-NEXT: and.w r2, r6, r2, lsl #1
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q3, q1
-; CHECK-NEXT: vldrh.u16 q0, [r0, #-16]
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q1
-; CHECK-NEXT: vldrh.u16 q1, [r1, #-16]
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0
+; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
+; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0
+; CHECK-NEXT: vldrh.u16 q0, [r1, #-16]
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q0
; CHECK-NEXT: vctp.16 r2
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1
+; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrht.u16 q2, [r0]
+; CHECK-NEXT: vldrht.u16 q1, [r0]
; CHECK-NEXT: cmp r2, #9
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vldrht.u16 q0, [r1]
-; CHECK-NEXT: vmlsldavat.s16 r4, r7, q2, q0
-; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q2, q0
+; CHECK-NEXT: vmlsldavat.s16 r4, r7, q1, q0
+; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q1, q0
; CHECK-NEXT: blo .LBB0_10
; CHECK-NEXT: @ %bb.4: @ %do.body.1
; CHECK-NEXT: subs r2, #8