diff options
Diffstat (limited to 'llvm/test/CodeGen/Thumb2')
6 files changed, 194 insertions, 186 deletions
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll index 98e082be4cad..1769c5d2fd38 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -21,12 +21,11 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: it lt ; ENABLED-NEXT: bxlt lr ; ENABLED-NEXT: .LBB0_1: @ %for.body.lr.ph -; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} ; ENABLED-NEXT: mov r11, r0 -; ENABLED-NEXT: ldr r0, [sp, #36] +; ENABLED-NEXT: ldr r0, [sp, #32] ; ENABLED-NEXT: add.w r9, r2, #3 ; ENABLED-NEXT: mov.w r12, #0 -; ENABLED-NEXT: mov.w r8, #1 ; ENABLED-NEXT: mov r10, r11 ; ENABLED-NEXT: uxth r0, r0 ; ENABLED-NEXT: rsbs r5, r0, #0 @@ -50,16 +49,18 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: @ %bb.5: @ %vector.ph ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: bic r0, r9, #3 -; ENABLED-NEXT: sub.w r4, r2, r12 +; ENABLED-NEXT: movs r7, #1 ; ENABLED-NEXT: subs r0, #4 +; ENABLED-NEXT: sub.w r4, r2, r12 ; ENABLED-NEXT: vmov.i32 q1, #0x0 -; ENABLED-NEXT: mov r7, r10 -; ENABLED-NEXT: add.w r6, r8, r0, lsr #2 +; ENABLED-NEXT: add.w r6, r7, r0, lsr #2 ; ENABLED-NEXT: adds r0, r2, #3 ; ENABLED-NEXT: sub.w r0, r0, r12 ; ENABLED-NEXT: bic r0, r0, #3 ; ENABLED-NEXT: subs r0, #4 -; ENABLED-NEXT: add.w lr, r8, r0, lsr #2 +; ENABLED-NEXT: add.w r0, r7, r0, lsr #2 +; ENABLED-NEXT: mov r7, r10 +; ENABLED-NEXT: dls lr, r0 ; ENABLED-NEXT: mov r0, r11 ; ENABLED-NEXT: .LBB0_6: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 @@ -82,7 +83,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: vaddv.u32 r0, q0 ; ENABLED-NEXT: b .LBB0_3 ; ENABLED-NEXT: .LBB0_8: -; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr} ; ENABLED-NEXT: bx lr ; ; NOREDUCTIONS-LABEL: varying_outer_2d_reduction: @@ -91,12 +92,11 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: it lt ; NOREDUCTIONS-NEXT: bxlt lr ; NOREDUCTIONS-NEXT: .LBB0_1: @ %for.body.lr.ph -; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} ; NOREDUCTIONS-NEXT: mov r11, r0 -; NOREDUCTIONS-NEXT: ldr r0, [sp, #36] +; NOREDUCTIONS-NEXT: ldr r0, [sp, #32] ; NOREDUCTIONS-NEXT: add.w r9, r2, #3 ; NOREDUCTIONS-NEXT: mov.w r12, #0 -; NOREDUCTIONS-NEXT: mov.w r8, #1 ; NOREDUCTIONS-NEXT: mov r10, r11 ; NOREDUCTIONS-NEXT: uxth r0, r0 ; NOREDUCTIONS-NEXT: rsbs r5, r0, #0 @@ -120,16 +120,18 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: bic r0, r9, #3 -; NOREDUCTIONS-NEXT: sub.w r4, r2, r12 +; NOREDUCTIONS-NEXT: movs r7, #1 ; NOREDUCTIONS-NEXT: subs r0, #4 +; NOREDUCTIONS-NEXT: sub.w r4, r2, r12 ; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0 -; NOREDUCTIONS-NEXT: mov r7, r10 -; NOREDUCTIONS-NEXT: add.w r6, r8, r0, lsr #2 +; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2 ; NOREDUCTIONS-NEXT: adds r0, r2, #3 ; NOREDUCTIONS-NEXT: sub.w r0, r0, r12 ; NOREDUCTIONS-NEXT: bic r0, r0, #3 ; NOREDUCTIONS-NEXT: subs r0, #4 -; NOREDUCTIONS-NEXT: add.w lr, r8, r0, lsr #2 +; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2 +; NOREDUCTIONS-NEXT: mov r7, r10 +; NOREDUCTIONS-NEXT: dls lr, r0 ; NOREDUCTIONS-NEXT: mov r0, r11 ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 @@ -152,7 +154,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0 ; NOREDUCTIONS-NEXT: b .LBB0_3 ; NOREDUCTIONS-NEXT: .LBB0_8: -; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr} ; NOREDUCTIONS-NEXT: bx lr entry: %conv = sext i16 %N to i32 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll index 435acc29f076..cbcbf1f392ce 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll @@ -165,73 +165,74 @@ define dso_local i32 @b(ptr %c, i32 %d, i32 %e, ptr %n) "frame-pointer"="all" { ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: wls lr, r1, .LBB2_3 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader -; CHECK-NEXT: add.w r9, r3, #4 -; CHECK-NEXT: add.w r10, r0, #4 +; CHECK-NEXT: adds r6, r3, #4 +; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: mvn r8, #1 -; CHECK-NEXT: @ implicit-def: $r6 +; CHECK-NEXT: @ implicit-def: $r9 ; CHECK-NEXT: @ implicit-def: $r4 ; CHECK-NEXT: str r2, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB2_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr.w r1, [r10] +; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: asrs r2, r4, #31 -; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [r1] ; CHECK-NEXT: muls r1, r3, r1 ; CHECK-NEXT: adds r4, r4, r1 ; CHECK-NEXT: adc.w r1, r2, r1, asr #31 ; CHECK-NEXT: adds.w r2, r4, #-2147483648 -; CHECK-NEXT: ldrd r5, r4, [r8] -; CHECK-NEXT: adc r2, r1, #0 +; CHECK-NEXT: ldrd r2, r4, [r8] +; CHECK-NEXT: adc r5, r1, #0 +; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: smull r4, r2, r4, r9 +; CHECK-NEXT: asrs r1, r5, #31 ; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: smull r4, r5, r4, r6 -; CHECK-NEXT: asrs r1, r2, #31 -; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: subs r4, r2, r4 -; CHECK-NEXT: sbcs r1, r5 -; CHECK-NEXT: adds.w r6, r4, #-2147483648 -; CHECK-NEXT: ldr r4, [r10, #-4] -; CHECK-NEXT: adc r11, r1, #0 -; CHECK-NEXT: mov r1, r9 -; CHECK-NEXT: add.w r10, r10, #4 +; CHECK-NEXT: subs r4, r5, r4 +; CHECK-NEXT: sbcs r1, r2 +; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: adds.w r10, r4, #-2147483648 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: ldr r4, [r2, #-4] ; CHECK-NEXT: muls r4, r3, r4 ; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: adds.w r12, r4, #-2147483648 ; CHECK-NEXT: asr.w r5, r4, #31 -; CHECK-NEXT: ldr.w r4, [r9] +; CHECK-NEXT: ldr r4, [r6] ; CHECK-NEXT: adc r5, r5, #0 ; CHECK-NEXT: mul r2, r4, r0 +; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: add.w r2, r2, #-2147483648 ; CHECK-NEXT: asrl r12, r5, r2 -; CHECK-NEXT: smull r2, r9, r4, r12 -; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: lsll r2, r9, #30 -; CHECK-NEXT: asr.w r5, r9, #31 -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r9, r1 -; CHECK-NEXT: ldrd r1, r0, [sp, #4] @ 8-byte Folded Reload -; CHECK-NEXT: lsll r2, r5, r4 -; CHECK-NEXT: lsrl r2, r5, #2 -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds r0, #2 -; CHECK-NEXT: lsll r2, r5, r0 -; CHECK-NEXT: add.w r0, r2, #-2147483648 +; CHECK-NEXT: smull r2, r5, r4, r12 +; CHECK-NEXT: lsll r2, r5, #30 +; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: asr.w r11, r5, #31 +; CHECK-NEXT: mov r12, r5 +; CHECK-NEXT: lsll r12, r11, r4 +; CHECK-NEXT: mul r2, r2, r9 +; CHECK-NEXT: lsrl r12, r11, #2 +; CHECK-NEXT: adds r2, #2 +; CHECK-NEXT: lsll r12, r11, r2 ; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload -; CHECK-NEXT: asrl r6, r11, r0 -; CHECK-NEXT: movs r0, #2 -; CHECK-NEXT: lsrl r6, r11, #2 -; CHECK-NEXT: str r6, [r0] -; CHECK-NEXT: ldr r0, [r8], #-4 -; CHECK-NEXT: mls r0, r0, r4, r1 -; CHECK-NEXT: adds.w r4, r0, #-2147483648 -; CHECK-NEXT: asr.w r1, r0, #31 +; CHECK-NEXT: add.w r5, r12, #-2147483648 +; CHECK-NEXT: asrl r10, r1, r5 +; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: lsrl r10, r1, #2 +; CHECK-NEXT: movs r1, #2 +; CHECK-NEXT: mov r9, r10 +; CHECK-NEXT: str.w r10, [r1] +; CHECK-NEXT: ldr r1, [r8], #-4 +; CHECK-NEXT: mls r5, r1, r4, r5 +; CHECK-NEXT: adds.w r4, r5, #-2147483648 +; CHECK-NEXT: asr.w r1, r5, #31 ; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: lsrl r4, r1, #2 -; CHECK-NEXT: rsbs r0, r4, #0 -; CHECK-NEXT: str r0, [r2] -; CHECK-NEXT: str r0, [r9, #-4] -; CHECK-NEXT: add.w r9, r9, #4 -; CHECK-NEXT: add.w r0, r12, #4 +; CHECK-NEXT: rsbs r1, r4, #0 +; CHECK-NEXT: str r1, [r2] +; CHECK-NEXT: str r1, [r6, #-4] +; CHECK-NEXT: adds r6, #4 +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: adds r1, #4 ; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: .LBB2_3: @ %while.end ; CHECK-NEXT: add sp, #16 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll index b60ee7c6d406..0d86f22a321e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -1313,29 +1313,27 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: @ Child Loop BB16_3 Depth 2 ; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: .LBB16_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB16_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vmov q0, q6 -; CHECK-NEXT: vadd.i32 q6, q5, r0 -; CHECK-NEXT: vmov r7, r3, d13 +; CHECK-NEXT: vadd.i32 q1, q5, r0 ; CHECK-NEXT: vadd.i32 q2, q4, r0 -; CHECK-NEXT: vmov r5, r6, d5 -; CHECK-NEXT: vmov q1, q7 -; CHECK-NEXT: vmov r4, r10, d12 +; CHECK-NEXT: vmov r7, r3, d3 ; CHECK-NEXT: vadd.i32 q6, q0, lr +; CHECK-NEXT: vmov r5, r6, d5 ; CHECK-NEXT: subs.w r9, r9, #16 +; CHECK-NEXT: vmov r4, r10, d2 +; CHECK-NEXT: vadd.i32 q1, q7, lr ; CHECK-NEXT: vadd.i32 q4, q4, lr ; CHECK-NEXT: vadd.i32 q5, q5, lr -; CHECK-NEXT: vadd.i32 q7, q7, lr ; CHECK-NEXT: ldrb.w r11, [r3] ; CHECK-NEXT: ldrb r3, [r7] ; CHECK-NEXT: vmov r7, r12, d4 -; CHECK-NEXT: vadd.i32 q2, q1, r0 -; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q7, r0 +; CHECK-NEXT: vadd.i32 q7, q0, r0 ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: ldrb r6, [r6] ; CHECK-NEXT: ldrb r4, [r4] @@ -1344,7 +1342,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: ldrb.w r1, [r12] ; CHECK-NEXT: vmov.8 q0[0], r7 ; CHECK-NEXT: vmov.8 q0[1], r1 -; CHECK-NEXT: vmov r1, r7, d3 +; CHECK-NEXT: vmov r1, r7, d15 ; CHECK-NEXT: vmov.8 q0[2], r5 ; CHECK-NEXT: vmov.8 q0[3], r6 ; CHECK-NEXT: vmov.8 q0[4], r4 @@ -1359,7 +1357,8 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: ldrb r3, [r5] ; CHECK-NEXT: ldrb.w r12, [r7] ; CHECK-NEXT: ldrb r5, [r4] -; CHECK-NEXT: vmov r4, r7, d2 +; CHECK-NEXT: vmov r4, r7, d14 +; CHECK-NEXT: vmov q7, q1 ; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: vmov.8 q0[8], r4 @@ -1371,6 +1370,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: vmov.8 q0[14], r3 ; CHECK-NEXT: vmov.8 q0[15], r12 ; CHECK-NEXT: vstrb.8 q0, [r8], #16 +; CHECK-NEXT: vmov q0, q6 ; CHECK-NEXT: bne .LBB16_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB16_2 Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll index c0b2da7eff41..eedca2cd4a5d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -236,11 +236,11 @@ define arm_aapcs_vfpcc void @push_out_mul_gather_scatter(ptr noalias nocapture r ; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q3, [r0, q1, uxtw #2] +; CHECK-NEXT: vldrw.u32 q2, [r0, q1, uxtw #2] +; CHECK-NEXT: vadd.i32 q3, q1, q0 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q3, [r0, q2, uxtw #2] +; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2] +; CHECK-NEXT: vmov q1, q3 ; CHECK-NEXT: bne .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %end ; CHECK-NEXT: bx lr @@ -330,20 +330,20 @@ define arm_aapcs_vfpcc void @non_gatscat_use1(ptr noalias nocapture readonly %da ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adr r4, .LCPI7_0 ; CHECK-NEXT: mov.w r12, #9 -; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: vldrw.u32 q1, [r4] ; CHECK-NEXT: mov.w lr, #12 ; CHECK-NEXT: movs r4, #8 -; CHECK-NEXT: vdup.32 q1, r0 +; CHECK-NEXT: vdup.32 q0, r0 ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vmla.i32 q3, q2, lr -; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vadd.i32 q2, q1, r4 +; CHECK-NEXT: vmla.i32 q3, q1, lr +; CHECK-NEXT: vmul.i32 q1, q1, r12 ; CHECK-NEXT: vldrw.u32 q4, [q3, #24] -; CHECK-NEXT: vmul.i32 q2, q2, r12 -; CHECK-NEXT: vadd.i32 q0, q0, r4 -; CHECK-NEXT: vstrw.32 q2, [r3] +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vstrw.32 q1, [r3] +; CHECK-NEXT: vmov q1, q2 ; CHECK-NEXT: vstrb.8 q4, [r1], #16 ; CHECK-NEXT: bne .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %end @@ -390,22 +390,22 @@ define arm_aapcs_vfpcc void @non_gatscat_use2(ptr noalias nocapture readonly %da ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: adr r4, .LCPI8_0 ; CHECK-NEXT: movs r5, #18 -; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: vldrw.u32 q2, [r4] ; CHECK-NEXT: mov.w r12, #9 ; CHECK-NEXT: mov.w lr, #12 ; CHECK-NEXT: movs r4, #8 -; CHECK-NEXT: vdup.32 q1, r0 -; CHECK-NEXT: vdup.32 q2, r5 +; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: vdup.32 q1, r5 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmla.i32 q4, q3, lr +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vadd.i32 q3, q2, r4 +; CHECK-NEXT: vmla.i32 q4, q2, lr ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vldrw.u32 q5, [q4, #24] -; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vmla.i32 q4, q3, r12 -; CHECK-NEXT: vadd.i32 q0, q0, r4 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmla.i32 q4, q2, r12 +; CHECK-NEXT: vmov q2, q3 ; CHECK-NEXT: vstrb.8 q5, [r1], #16 ; CHECK-NEXT: vstrw.32 q4, [r3] ; CHECK-NEXT: bne .LBB8_1 @@ -487,21 +487,21 @@ define dso_local void @arm_mat_mult_q31(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: @ => This Loop Header: Depth=2 ; CHECK-NEXT: @ Child Loop BB9_3 Depth 3 ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vmov q7, q2 ; CHECK-NEXT: dls lr, r10 ; CHECK-NEXT: vmov.i32 q5, #0x0 -; CHECK-NEXT: vmlas.i32 q1, q0, r7 -; CHECK-NEXT: vmov q7, q4 +; CHECK-NEXT: vmlas.i32 q7, q0, r7 +; CHECK-NEXT: vmov q6, q4 ; CHECK-NEXT: .LBB9_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1 ; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vmov q6, q1 -; CHECK-NEXT: vadd.i32 q1, q1, q3 -; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2] -; CHECK-NEXT: vldrw.u32 q6, [q7, #32]! -; CHECK-NEXT: vmul.i32 q0, q0, q6 -; CHECK-NEXT: vadd.i32 q5, q0, q5 +; CHECK-NEXT: vadd.i32 q0, q7, q3 +; CHECK-NEXT: vldrw.u32 q1, [r1, q7, uxtw #2] +; CHECK-NEXT: vldrw.u32 q7, [q6, #32]! +; CHECK-NEXT: vmul.i32 q1, q1, q7 +; CHECK-NEXT: vmov q7, q0 +; CHECK-NEXT: vadd.i32 q5, q1, q5 ; CHECK-NEXT: le lr, .LBB9_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2 @@ -702,12 +702,12 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vmov q6, q5 -; CHECK-NEXT: vadd.i32 q5, q5, q3 -; CHECK-NEXT: vldrh.s32 q7, [r1, q6, uxtw #1] -; CHECK-NEXT: vldrh.s32 q6, [r3], #8 -; CHECK-NEXT: vmul.i32 q6, q7, q6 -; CHECK-NEXT: vadd.i32 q4, q6, q4 +; CHECK-NEXT: vadd.i32 q6, q5, q3 +; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1] +; CHECK-NEXT: vldrh.s32 q5, [r3], #8 +; CHECK-NEXT: vmul.i32 q5, q7, q5 +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmov q5, q6 ; CHECK-NEXT: le lr, .LBB10_11 ; CHECK-NEXT: @ %bb.12: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 @@ -922,15 +922,15 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(ptr nocapture readonly ; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3 ; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=5 -; CHECK-NEXT: vmov q7, q5 -; CHECK-NEXT: vmov q6, q4 -; CHECK-NEXT: vldrb.s32 q2, [r0, q7] -; CHECK-NEXT: vldrb.s32 q7, [r1, q6] -; CHECK-NEXT: subs r5, #4 -; CHECK-NEXT: vadd.i32 q4, q4, q0 +; CHECK-NEXT: vldrb.s32 q2, [r0, q5] +; CHECK-NEXT: vadd.i32 q7, q5, q0 +; CHECK-NEXT: vldrb.s32 q5, [r1, q4] +; CHECK-NEXT: vadd.i32 q6, q4, q0 ; CHECK-NEXT: vadd.i32 q2, q2, r2 -; CHECK-NEXT: vadd.i32 q5, q5, q0 -; CHECK-NEXT: vmlava.u32 r12, q2, q7 +; CHECK-NEXT: subs r5, #4 +; CHECK-NEXT: vmlava.u32 r12, q2, q5 +; CHECK-NEXT: vmov q5, q7 +; CHECK-NEXT: vmov q4, q6 ; CHECK-NEXT: bne .LBB11_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=4 diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll index 652d25af02e7..828f8e4f8304 100644 --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll @@ -105,66 +105,68 @@ define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #12 -; CHECK-NEXT: sub sp, #12 +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: cmp r3, #1 -; CHECK-NEXT: stm.w sp, {r0, r1, r3} @ 12-byte Folded Spill +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: blt .LBB4_12 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph -; CHECK-NEXT: ldr r1, [sp, #48] -; CHECK-NEXT: add.w r12, r2, #3 +; CHECK-NEXT: ldr r7, [sp, #44] +; CHECK-NEXT: add.w r10, r2, #3 ; CHECK-NEXT: ldr.w r11, [sp] @ 4-byte Reload -; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: mov r8, r2 +; CHECK-NEXT: mov r9, r2 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: uxth r3, r1 +; CHECK-NEXT: uxth.w r12, r7 +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: str.w r9, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: b .LBB4_4 ; CHECK-NEXT: .LBB4_2: @ in Loop: Header=BB4_4 Depth=1 ; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: .LBB4_3: @ %for.end ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: lsrs r2, r6, #16 -; CHECK-NEXT: sub.w r12, r12, #1 +; CHECK-NEXT: lsrs r0, r6, #16 +; CHECK-NEXT: sub.w r10, r10, #1 +; CHECK-NEXT: strh.w r0, [r5, r8, lsl #1] +; CHECK-NEXT: add.w r8, r8, #1 ; CHECK-NEXT: add.w r11, r11, #2 -; CHECK-NEXT: sub.w r8, r8, #1 -; CHECK-NEXT: strh.w r2, [r7, r10, lsl #1] -; CHECK-NEXT: add.w r10, r10, #1 -; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: cmp r10, r2 -; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: sub.w r9, r9, #1 +; CHECK-NEXT: cmp r8, r3 ; CHECK-NEXT: beq .LBB4_12 ; CHECK-NEXT: .LBB4_4: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB4_8 Depth 2 ; CHECK-NEXT: @ Child Loop BB4_11 Depth 2 -; CHECK-NEXT: cmp r2, r10 +; CHECK-NEXT: cmp r0, r8 ; CHECK-NEXT: ble .LBB4_2 ; CHECK-NEXT: @ %bb.5: @ %vector.main.loop.iter.check ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: sub.w r4, r2, r10 -; CHECK-NEXT: cmp r4, #8 +; CHECK-NEXT: sub.w r0, r0, r8 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: cmp r0, #8 ; CHECK-NEXT: bhs .LBB4_7 ; CHECK-NEXT: @ %bb.6: @ in Loop: Header=BB4_4 Depth=1 ; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: b .LBB4_10 ; CHECK-NEXT: .LBB4_7: @ %vector.ph ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: bic r2, r8, #7 -; CHECK-NEXT: movs r7, #1 -; CHECK-NEXT: subs r2, #8 -; CHECK-NEXT: bic r9, r4, #7 -; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: bic r7, r9, #7 +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: subs r7, #8 +; CHECK-NEXT: bic r1, r0, #7 ; CHECK-NEXT: mov r5, r11 -; CHECK-NEXT: add.w lr, r7, r2, lsr #3 -; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload +; CHECK-NEXT: add.w lr, r6, r7, lsr #3 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: ldr r4, [sp] @ 4-byte Reload ; CHECK-NEXT: .LBB4_8: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrh.u16 q0, [r2], #16 +; CHECK-NEXT: vldrh.u16 q0, [r4], #16 ; CHECK-NEXT: vldrh.u16 q1, [r5], #16 -; CHECK-NEXT: rsbs r7, r3, #0 +; CHECK-NEXT: rsb.w r7, r12, #0 ; CHECK-NEXT: vmullb.s16 q2, q1, q0 ; CHECK-NEXT: vmullt.s16 q0, q1, q0 ; CHECK-NEXT: vshl.s32 q2, r7 @@ -174,29 +176,32 @@ define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef ; CHECK-NEXT: le lr, .LBB4_8 ; CHECK-NEXT: @ %bb.9: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: cmp r4, r9 +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: beq .LBB4_3 ; CHECK-NEXT: .LBB4_10: @ %vec.epilog.ph ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 -; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: add.w r2, r9, r10 -; CHECK-NEXT: add.w r7, r1, r9, lsl #1 -; CHECK-NEXT: add.w r2, r1, r2, lsl #1 -; CHECK-NEXT: sub.w r5, r8, r9 -; CHECK-NEXT: dlstp.32 lr, r5 +; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload +; CHECK-NEXT: add.w r4, r1, r8 +; CHECK-NEXT: sub.w r7, r9, r1 +; CHECK-NEXT: add.w r2, r0, r1, lsl #1 +; CHECK-NEXT: add.w r4, r0, r4, lsl #1 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB4_11: @ %vec.epilog.vector.body ; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: rsbs r4, r3, #0 -; CHECK-NEXT: vldrh.s32 q0, [r7], #8 -; CHECK-NEXT: vldrh.s32 q1, [r2], #8 +; CHECK-NEXT: rsb.w r0, r12, #0 +; CHECK-NEXT: vldrh.s32 q0, [r2], #8 +; CHECK-NEXT: vldrh.s32 q1, [r4], #8 ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: vshl.s32 q0, r4 +; CHECK-NEXT: vshl.s32 q0, r0 ; CHECK-NEXT: vaddva.u32 r6, q0 ; CHECK-NEXT: letp lr, .LBB4_11 ; CHECK-NEXT: b .LBB4_3 ; CHECK-NEXT: .LBB4_12: @ %for.end17 -; CHECK-NEXT: add sp, #12 +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %conv = sext i16 %Ls to i32 diff --git a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll index d6c5cde30ed7..43ed5eefbf4c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll @@ -18,50 +18,50 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32 ; CHECK-NEXT: csel r7, r6, r5, hs ; CHECK-NEXT: add.w lr, r7, #1 ; CHECK-NEXT: mov r4, r5 -; CHECK-NEXT: vldrh.u16 q1, [r0], #32 +; CHECK-NEXT: vldrh.u16 q0, [r0], #32 ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: mov r8, r5 -; CHECK-NEXT: vldrh.u16 q2, [r1], #32 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q2 -; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q2 -; CHECK-NEXT: vldrh.u16 q2, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q2 ; CHECK-NEXT: vldrh.u16 q1, [r1], #32 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 +; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1 +; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 +; CHECK-NEXT: vldrh.u16 q0, [r1], #32 ; CHECK-NEXT: sub.w lr, lr, #1 ; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: vldrh.u16 q3, [r0], #32 +; CHECK-NEXT: vldrh.u16 q1, [r0], #32 ; CHECK-NEXT: beq .LBB0_3 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB0_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q2 -; CHECK-NEXT: vldrh.u16 q2, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q1 -; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q3, q1 -; CHECK-NEXT: vldrh.u16 q3, [r0], #32 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q2 -; CHECK-NEXT: vldrh.u16 q1, [r1], #32 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 +; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 +; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 +; CHECK-NEXT: vldrh.u16 q1, [r0], #32 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 +; CHECK-NEXT: vldrh.u16 q0, [r1], #32 ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q2 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 ; CHECK-NEXT: movs r6, #14 ; CHECK-NEXT: and.w r2, r6, r2, lsl #1 -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q3, q1 -; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q1 -; CHECK-NEXT: vldrh.u16 q1, [r1, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 +; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r1, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q0 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrht.u16 q2, [r0] +; CHECK-NEXT: vldrht.u16 q1, [r0] ; CHECK-NEXT: cmp r2, #9 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vldrht.u16 q0, [r1] -; CHECK-NEXT: vmlsldavat.s16 r4, r7, q2, q0 -; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q2, q0 +; CHECK-NEXT: vmlsldavat.s16 r4, r7, q1, q0 +; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q1, q0 ; CHECK-NEXT: blo .LBB0_10 ; CHECK-NEXT: @ %bb.4: @ %do.body.1 ; CHECK-NEXT: subs r2, #8 |
