diff options
Diffstat (limited to 'llvm/test/CodeGen/AArch64')
103 files changed, 14635 insertions, 2921 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll index 5bc041aef88b..e6bf3ab67471 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll @@ -6002,15 +6002,17 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) { ; CHECK-NOLSE-O1-NEXT: b.ne LBB67_4 ; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %cmpxchg.trystore ; CHECK-NOLSE-O1-NEXT: ; in Loop: Header=BB67_1 Depth=1 -; CHECK-NOLSE-O1-NEXT: stxrb w9, w2, [x8] -; CHECK-NOLSE-O1-NEXT: cbnz w9, LBB67_1 -; CHECK-NOLSE-O1-NEXT: ; %bb.3: -; CHECK-NOLSE-O1-NEXT: mov w1, #1 ; =0x1 +; CHECK-NOLSE-O1-NEXT: stxrb w10, w2, [x8] +; CHECK-NOLSE-O1-NEXT: mov w9, #1 ; =0x1 +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB67_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.3: ; %cmpxchg.end +; CHECK-NOLSE-O1-NEXT: mov w1, w9 ; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0 ; CHECK-NOLSE-O1-NEXT: ret ; CHECK-NOLSE-O1-NEXT: LBB67_4: ; %cmpxchg.nostore -; CHECK-NOLSE-O1-NEXT: mov w1, wzr +; CHECK-NOLSE-O1-NEXT: mov w9, wzr ; CHECK-NOLSE-O1-NEXT: clrex +; CHECK-NOLSE-O1-NEXT: mov w1, w9 ; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0 ; CHECK-NOLSE-O1-NEXT: ret ; @@ -6108,15 +6110,17 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) { ; CHECK-NOLSE-O1-NEXT: b.ne LBB68_4 ; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %cmpxchg.trystore ; CHECK-NOLSE-O1-NEXT: ; in Loop: Header=BB68_1 Depth=1 -; CHECK-NOLSE-O1-NEXT: stxrh w9, w2, [x8] -; CHECK-NOLSE-O1-NEXT: cbnz w9, LBB68_1 -; CHECK-NOLSE-O1-NEXT: ; %bb.3: -; CHECK-NOLSE-O1-NEXT: mov w1, #1 ; =0x1 +; CHECK-NOLSE-O1-NEXT: stxrh w10, w2, [x8] +; CHECK-NOLSE-O1-NEXT: mov w9, #1 ; =0x1 +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB68_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.3: ; %cmpxchg.end +; CHECK-NOLSE-O1-NEXT: mov w1, w9 ; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0 ; CHECK-NOLSE-O1-NEXT: ret ; CHECK-NOLSE-O1-NEXT: LBB68_4: ; %cmpxchg.nostore -; 
CHECK-NOLSE-O1-NEXT: mov w1, wzr +; CHECK-NOLSE-O1-NEXT: mov w9, wzr ; CHECK-NOLSE-O1-NEXT: clrex +; CHECK-NOLSE-O1-NEXT: mov w1, w9 ; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0 ; CHECK-NOLSE-O1-NEXT: ret ; @@ -6206,6 +6210,7 @@ define { i32, i1 } @cmpxchg_i32(ptr %ptr, i32 %desired, i32 %new) { ; CHECK-NOLSE-O1-LABEL: cmpxchg_i32: ; CHECK-NOLSE-O1: ; %bb.0: ; CHECK-NOLSE-O1-NEXT: mov x8, x0 +; CHECK-NOLSE-O1-NEXT: mov w9, #1 ; =0x1 ; CHECK-NOLSE-O1-NEXT: LBB69_1: ; %cmpxchg.start ; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NOLSE-O1-NEXT: ldxr w0, [x8] @@ -6213,15 +6218,16 @@ define { i32, i1 } @cmpxchg_i32(ptr %ptr, i32 %desired, i32 %new) { ; CHECK-NOLSE-O1-NEXT: b.ne LBB69_4 ; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %cmpxchg.trystore ; CHECK-NOLSE-O1-NEXT: ; in Loop: Header=BB69_1 Depth=1 -; CHECK-NOLSE-O1-NEXT: stxr w9, w2, [x8] -; CHECK-NOLSE-O1-NEXT: cbnz w9, LBB69_1 -; CHECK-NOLSE-O1-NEXT: ; %bb.3: -; CHECK-NOLSE-O1-NEXT: mov w1, #1 ; =0x1 +; CHECK-NOLSE-O1-NEXT: stxr w10, w2, [x8] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB69_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.3: ; %cmpxchg.end +; CHECK-NOLSE-O1-NEXT: mov w1, w9 ; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0 ; CHECK-NOLSE-O1-NEXT: ret ; CHECK-NOLSE-O1-NEXT: LBB69_4: ; %cmpxchg.nostore -; CHECK-NOLSE-O1-NEXT: mov w1, wzr +; CHECK-NOLSE-O1-NEXT: mov w9, wzr ; CHECK-NOLSE-O1-NEXT: clrex +; CHECK-NOLSE-O1-NEXT: mov w1, w9 ; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0 ; CHECK-NOLSE-O1-NEXT: ret ; @@ -6306,6 +6312,7 @@ define { i64, i1 } @cmpxchg_i64(ptr %ptr, i64 %desired, i64 %new) { ; CHECK-NOLSE-O1-LABEL: cmpxchg_i64: ; CHECK-NOLSE-O1: ; %bb.0: ; CHECK-NOLSE-O1-NEXT: mov x8, x0 +; CHECK-NOLSE-O1-NEXT: mov w9, #1 ; =0x1 ; CHECK-NOLSE-O1-NEXT: LBB70_1: ; %cmpxchg.start ; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NOLSE-O1-NEXT: ldxr x0, [x8] @@ -6313,14 +6320,15 @@ define { i64, i1 } @cmpxchg_i64(ptr %ptr, i64 %desired, i64 %new) { ; 
CHECK-NOLSE-O1-NEXT: b.ne LBB70_4 ; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %cmpxchg.trystore ; CHECK-NOLSE-O1-NEXT: ; in Loop: Header=BB70_1 Depth=1 -; CHECK-NOLSE-O1-NEXT: stxr w9, x2, [x8] -; CHECK-NOLSE-O1-NEXT: cbnz w9, LBB70_1 -; CHECK-NOLSE-O1-NEXT: ; %bb.3: -; CHECK-NOLSE-O1-NEXT: mov w1, #1 ; =0x1 +; CHECK-NOLSE-O1-NEXT: stxr w10, x2, [x8] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB70_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.3: ; %cmpxchg.end +; CHECK-NOLSE-O1-NEXT: mov w1, w9 ; CHECK-NOLSE-O1-NEXT: ret ; CHECK-NOLSE-O1-NEXT: LBB70_4: ; %cmpxchg.nostore -; CHECK-NOLSE-O1-NEXT: mov w1, wzr +; CHECK-NOLSE-O1-NEXT: mov w9, wzr ; CHECK-NOLSE-O1-NEXT: clrex +; CHECK-NOLSE-O1-NEXT: mov w1, w9 ; CHECK-NOLSE-O1-NEXT: ret ; ; CHECK-OUTLINE-O1-LABEL: cmpxchg_i64: @@ -6404,6 +6412,7 @@ define { ptr, i1 } @cmpxchg_ptr(ptr %ptr, ptr %desired, ptr %new) { ; CHECK-NOLSE-O1-LABEL: cmpxchg_ptr: ; CHECK-NOLSE-O1: ; %bb.0: ; CHECK-NOLSE-O1-NEXT: mov x8, x0 +; CHECK-NOLSE-O1-NEXT: mov w9, #1 ; =0x1 ; CHECK-NOLSE-O1-NEXT: LBB71_1: ; %cmpxchg.start ; CHECK-NOLSE-O1-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NOLSE-O1-NEXT: ldxr x0, [x8] @@ -6411,14 +6420,15 @@ define { ptr, i1 } @cmpxchg_ptr(ptr %ptr, ptr %desired, ptr %new) { ; CHECK-NOLSE-O1-NEXT: b.ne LBB71_4 ; CHECK-NOLSE-O1-NEXT: ; %bb.2: ; %cmpxchg.trystore ; CHECK-NOLSE-O1-NEXT: ; in Loop: Header=BB71_1 Depth=1 -; CHECK-NOLSE-O1-NEXT: stxr w9, x2, [x8] -; CHECK-NOLSE-O1-NEXT: cbnz w9, LBB71_1 -; CHECK-NOLSE-O1-NEXT: ; %bb.3: -; CHECK-NOLSE-O1-NEXT: mov w1, #1 ; =0x1 +; CHECK-NOLSE-O1-NEXT: stxr w10, x2, [x8] +; CHECK-NOLSE-O1-NEXT: cbnz w10, LBB71_1 +; CHECK-NOLSE-O1-NEXT: ; %bb.3: ; %cmpxchg.end +; CHECK-NOLSE-O1-NEXT: mov w1, w9 ; CHECK-NOLSE-O1-NEXT: ret ; CHECK-NOLSE-O1-NEXT: LBB71_4: ; %cmpxchg.nostore -; CHECK-NOLSE-O1-NEXT: mov w1, wzr +; CHECK-NOLSE-O1-NEXT: mov w9, wzr ; CHECK-NOLSE-O1-NEXT: clrex +; CHECK-NOLSE-O1-NEXT: mov w1, w9 ; CHECK-NOLSE-O1-NEXT: ret ; ; CHECK-OUTLINE-O1-LABEL: cmpxchg_ptr: diff --git 
a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll index 2779e89c373f..57481724936a 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll @@ -9,7 +9,7 @@ define i32 @val_compare_and_swap(ptr %p, i32 %cmp, i32 %new) { ; CHECK-NEXT: liveins: $w1, $w2, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.cmpxchg.start: - ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK-NEXT: successors: %bb.2(0x7ffff800), %bb.3(0x00000800) ; CHECK-NEXT: liveins: $w1, $w2, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) @@ -17,7 +17,7 @@ define i32 @val_compare_and_swap(ptr %p, i32 %cmp, i32 %new) { ; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.cmpxchg.trystore: - ; CHECK-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: successors: %bb.4(0x7ffff800), %bb.1(0x00000800) ; CHECK-NEXT: liveins: $w1, $w2, $x0, $x8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: early-clobber renamable $w9 = STXRW renamable $w2, renamable $x0, pcsections !0 :: (volatile store (s32) into %ir.p) @@ -49,7 +49,7 @@ define i32 @val_compare_and_swap_from_load(ptr %p, i32 %cmp, ptr %pnew) { ; CHECK-NEXT: renamable $w9 = LDRWui killed renamable $x2, 0, implicit-def $x9, pcsections !0 :: (load (s32) from %ir.pnew) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.cmpxchg.start: - ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK-NEXT: successors: %bb.2(0x7ffff800), %bb.3(0x00000800) ; CHECK-NEXT: liveins: $w1, $x0, $x9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) @@ -57,7 +57,7 @@ define i32 @val_compare_and_swap_from_load(ptr %p, i32 %cmp, ptr %pnew) { ; CHECK-NEXT: Bcc 1, %bb.3, implicit 
killed $nzcv, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.cmpxchg.trystore: - ; CHECK-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: successors: %bb.4(0x7ffff800), %bb.1(0x00000800) ; CHECK-NEXT: liveins: $w1, $x0, $x8, $x9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: early-clobber renamable $w10 = STXRW renamable $w9, renamable $x0, pcsections !0 :: (volatile store (s32) into %ir.p) @@ -88,7 +88,7 @@ define i32 @val_compare_and_swap_rel(ptr %p, i32 %cmp, i32 %new) { ; CHECK-NEXT: liveins: $w1, $w2, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.cmpxchg.start: - ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK-NEXT: successors: %bb.2(0x7ffff800), %bb.3(0x00000800) ; CHECK-NEXT: liveins: $w1, $w2, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) @@ -96,7 +96,7 @@ define i32 @val_compare_and_swap_rel(ptr %p, i32 %cmp, i32 %new) { ; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.cmpxchg.trystore: - ; CHECK-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: successors: %bb.4(0x7ffff800), %bb.1(0x00000800) ; CHECK-NEXT: liveins: $w1, $w2, $x0, $x8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: early-clobber renamable $w9 = STLXRW renamable $w2, renamable $x0, pcsections !0 :: (volatile store (s32) into %ir.p) @@ -126,7 +126,7 @@ define i64 @val_compare_and_swap_64(ptr %p, i64 %cmp, i64 %new) { ; CHECK-NEXT: liveins: $x0, $x1, $x2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.cmpxchg.start: - ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK-NEXT: successors: %bb.2(0x7ffff800), %bb.3(0x00000800) ; CHECK-NEXT: liveins: $x0, $x1, $x2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $x8 = LDXRX renamable $x0, pcsections !0 :: (volatile load (s64) from %ir.p) @@ -134,7 +134,7 @@ define i64 @val_compare_and_swap_64(ptr %p, i64 %cmp, i64 %new) { ; CHECK-NEXT: Bcc 
1, %bb.3, implicit killed $nzcv, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.cmpxchg.trystore: - ; CHECK-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: successors: %bb.4(0x7ffff800), %bb.1(0x00000800) ; CHECK-NEXT: liveins: $x0, $x1, $x2, $x8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: early-clobber renamable $w9 = STXRX renamable $x2, renamable $x0, pcsections !0 :: (volatile store (s64) into %ir.p) @@ -164,7 +164,7 @@ define i64 @val_compare_and_swap_64_monotonic_seqcst(ptr %p, i64 %cmp, i64 %new) ; CHECK-NEXT: liveins: $x0, $x1, $x2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.cmpxchg.start: - ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK-NEXT: successors: %bb.2(0x7ffff800), %bb.3(0x00000800) ; CHECK-NEXT: liveins: $x0, $x1, $x2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $x8 = LDAXRX renamable $x0, pcsections !0 :: (volatile load (s64) from %ir.p) @@ -172,7 +172,7 @@ define i64 @val_compare_and_swap_64_monotonic_seqcst(ptr %p, i64 %cmp, i64 %new) ; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.cmpxchg.trystore: - ; CHECK-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: successors: %bb.4(0x7ffff800), %bb.1(0x00000800) ; CHECK-NEXT: liveins: $x0, $x1, $x2, $x8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: early-clobber renamable $w9 = STLXRX renamable $x2, renamable $x0, pcsections !0 :: (volatile store (s64) into %ir.p) @@ -202,7 +202,7 @@ define i64 @val_compare_and_swap_64_release_acquire(ptr %p, i64 %cmp, i64 %new) ; CHECK-NEXT: liveins: $x0, $x1, $x2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.cmpxchg.start: - ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK-NEXT: successors: %bb.2(0x7ffff800), %bb.3(0x00000800) ; CHECK-NEXT: liveins: $x0, $x1, $x2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $x8 = LDAXRX renamable $x0, pcsections !0 :: (volatile load (s64) from %ir.p) @@ -210,7 +210,7 @@ define i64 
@val_compare_and_swap_64_release_acquire(ptr %p, i64 %cmp, i64 %new) ; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.cmpxchg.trystore: - ; CHECK-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: successors: %bb.4(0x7ffff800), %bb.1(0x00000800) ; CHECK-NEXT: liveins: $x0, $x1, $x2, $x8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: early-clobber renamable $w9 = STLXRX renamable $x2, renamable $x0, pcsections !0 :: (volatile store (s64) into %ir.p) @@ -240,7 +240,7 @@ define i32 @fetch_and_nand(ptr %p) { ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) @@ -265,7 +265,7 @@ define i64 @fetch_and_nand_64(ptr %p) { ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $x8 = LDAXRX renamable $x0, pcsections !0 :: (volatile load (s64) from %ir.p) @@ -292,7 +292,7 @@ define i32 @fetch_and_or(ptr %p) { ; CHECK-NEXT: renamable $w9 = MOVZWi 5, 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) @@ -316,7 +316,7 @@ define i64 @fetch_and_or_64(ptr %p) { ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: 
successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $x8 = LDXRX renamable $x0, pcsections !0 :: (volatile load (s64) from %ir.p) @@ -723,7 +723,7 @@ define i8 @atomicrmw_add_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) @@ -747,7 +747,7 @@ define i8 @atomicrmw_xchg_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) @@ -770,7 +770,7 @@ define i8 @atomicrmw_sub_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) @@ -794,7 +794,7 @@ define i8 @atomicrmw_and_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; 
CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) @@ -818,7 +818,7 @@ define i8 @atomicrmw_or_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) @@ -842,7 +842,7 @@ define i8 @atomicrmw_xor_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) @@ -866,7 +866,7 @@ define i8 @atomicrmw_min_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) @@ -892,7 +892,7 @@ define i8 @atomicrmw_max_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) @@ -920,7 +920,7 @@ define i8 
@atomicrmw_umin_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: renamable $w9 = ANDWri killed renamable $w1, 7 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) @@ -948,7 +948,7 @@ define i8 @atomicrmw_umax_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: renamable $w9 = ANDWri killed renamable $w1, 7 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) @@ -974,7 +974,7 @@ define i16 @atomicrmw_add_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) @@ -998,7 +998,7 @@ define i16 @atomicrmw_xchg_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) @@ -1021,7 +1021,7 @@ define i16 @atomicrmw_sub_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: liveins: $w1, $x0 ; 
CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) @@ -1045,7 +1045,7 @@ define i16 @atomicrmw_and_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) @@ -1069,7 +1069,7 @@ define i16 @atomicrmw_or_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) @@ -1093,7 +1093,7 @@ define i16 @atomicrmw_xor_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) @@ -1117,7 +1117,7 @@ define i16 @atomicrmw_min_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; 
CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) @@ -1143,7 +1143,7 @@ define i16 @atomicrmw_max_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) @@ -1171,7 +1171,7 @@ define i16 @atomicrmw_umin_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: renamable $w9 = ANDWri killed renamable $w1, 15 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) @@ -1199,7 +1199,7 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: renamable $w9 = ANDWri killed renamable $w1, 15 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: - ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) @@ -1227,34 +1227,35 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) { ; CHECK-NEXT: $x8 = ORRXrs $xzr, $x0, 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.cmpxchg.start: - ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.4(0x04000000) + ; CHECK-NEXT: successors: %bb.2(0x7ffff800), 
%bb.3(0x00000800) ; CHECK-NEXT: liveins: $w1, $w2, $x8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w0 = LDXRB renamable $x8, implicit-def $x0, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: renamable $w9 = ANDWri renamable $w0, 7, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 0, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: Bcc 1, %bb.4, implicit killed $nzcv, pcsections !0 + ; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.cmpxchg.trystore: - ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: successors: %bb.4(0x7ffff800), %bb.1(0x00000800) ; CHECK-NEXT: liveins: $w1, $w2, $x0, $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: early-clobber renamable $w9 = STXRB renamable $w2, renamable $x8, pcsections !0 :: (volatile store (s8) into %ir.ptr) - ; CHECK-NEXT: CBNZW killed renamable $w9, %bb.1 + ; CHECK-NEXT: early-clobber renamable $w10 = STXRB renamable $w2, renamable $x8, pcsections !0 :: (volatile store (s8) into %ir.ptr) + ; CHECK-NEXT: renamable $w9 = MOVZWi 1, 0 + ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1 + ; CHECK-NEXT: B %bb.4 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: bb.3.cmpxchg.nostore: + ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w1 = MOVZWi 1, 0 - ; CHECK-NEXT: $w0 = KILL renamable $w0, implicit killed $x0 - ; CHECK-NEXT: RET undef $lr, implicit $w0, implicit $w1 + ; CHECK-NEXT: $w9 = ORRWrs $wzr, $wzr, 0 + ; CHECK-NEXT: CLREX 15, pcsections !0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4.cmpxchg.nostore: - ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: bb.4.cmpxchg.end: + ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w1 = ORRWrs $wzr, $wzr, 0 - ; CHECK-NEXT: CLREX 15, pcsections !0 ; CHECK-NEXT: $w0 = KILL renamable $w0, implicit killed $x0 + ; CHECK-NEXT: $w1 = ORRWrs $wzr, killed $w9, 0 ; 
CHECK-NEXT: RET undef $lr, implicit $w0, implicit $w1 %res = cmpxchg ptr %ptr, i8 %desired, i8 %new monotonic monotonic, !pcsections !0 ret { i8, i1 } %res @@ -1269,35 +1270,36 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) { ; CHECK-NEXT: $x8 = ORRXrs $xzr, $x0, 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.cmpxchg.start: - ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.4(0x04000000) + ; CHECK-NEXT: successors: %bb.2(0x7ffff800), %bb.3(0x00000800) ; CHECK-NEXT: liveins: $w1, $w2, $x8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w0 = LDXRH renamable $x8, implicit-def $x0, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: renamable $w9 = ANDWri renamable $w0, 15, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 8, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: Bcc 1, %bb.4, implicit killed $nzcv, pcsections !0 + ; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.cmpxchg.trystore: - ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: successors: %bb.4(0x7ffff800), %bb.1(0x00000800) ; CHECK-NEXT: liveins: $w1, $w2, $x0, $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: early-clobber renamable $w9 = STXRH renamable $w2, renamable $x8, pcsections !0 :: (volatile store (s16) into %ir.ptr) - ; CHECK-NEXT: CBNZW killed renamable $w9, %bb.1 + ; CHECK-NEXT: early-clobber renamable $w10 = STXRH renamable $w2, renamable $x8, pcsections !0 :: (volatile store (s16) into %ir.ptr) + ; CHECK-NEXT: renamable $w9 = MOVZWi 1, 0 + ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1 + ; CHECK-NEXT: B %bb.4 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: bb.3.cmpxchg.nostore: + ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w1 = MOVZWi 1, 0 - ; CHECK-NEXT: $w0 = KILL renamable $w0, implicit killed $x0 - ; CHECK-NEXT: RET undef $lr, implicit $w0, implicit $w1 + ; 
CHECK-NEXT: $w9 = ORRWrs $wzr, $wzr, 0 + ; CHECK-NEXT: CLREX 15, pcsections !0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4.cmpxchg.nostore: - ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: bb.4.cmpxchg.end: + ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w1 = ORRWrs $wzr, $wzr, 0 - ; CHECK-NEXT: CLREX 15, pcsections !0 ; CHECK-NEXT: $w0 = KILL renamable $w0, implicit killed $x0 - ; CHECK-NEXT: RET undef $lr, implicit $w0, implicit $w1 + ; CHECK-NEXT: $w1 = ORRWrs $wzr, killed $w9, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0, implicit $w1 %res = cmpxchg ptr %ptr, i16 %desired, i16 %new monotonic monotonic, !pcsections !0 ret { i16, i1 } %res } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll new file mode 100644 index 000000000000..ed68723e470a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll @@ -0,0 +1,6333 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc %s -o - | FileCheck %s --check-prefixes CHECK,SDAG +; RUN: llc %s -global-isel -global-isel-abort=1 -o - | FileCheck %s --check-prefixes CHECK,GISEL +target datalayout = "e-m:o-i64:64-i512:128-n32:64-S128" +target triple = "arm64-apple-macosx14.0.0" + +define void @test_shl_i512(ptr %result, ptr %input, i32 %shift) { +; SDAG-LABEL: test_shl_i512: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: sub sp, sp, #128 +; SDAG-NEXT: .cfi_def_cfa_offset 128 +; SDAG-NEXT: ldp x9, x8, [x1, #48] +; SDAG-NEXT: movi.2d v0, #0000000000000000 +; SDAG-NEXT: ldp q1, q2, [x1] +; SDAG-NEXT: mvn w14, w2 +; SDAG-NEXT: ldr q3, [x1, #32] +; SDAG-NEXT: stp x9, x8, [sp, #112] +; SDAG-NEXT: mov w8, w2 +; SDAG-NEXT: mov x9, sp +; SDAG-NEXT: lsr x10, x8, #3 +; SDAG-NEXT: add x9, x9, #64 +; SDAG-NEXT: stp q0, q0, [sp] +; SDAG-NEXT: stp q0, q0, [sp, #32] +; SDAG-NEXT: and x3, x8, #0x3f +; SDAG-NEXT: and x10, x10, #0x38 +; 
SDAG-NEXT: stp q2, q3, [sp, #80] +; SDAG-NEXT: eor x3, x3, #0x3f +; SDAG-NEXT: sub x10, x9, x10 +; SDAG-NEXT: str q1, [sp, #64] +; SDAG-NEXT: ldp x9, x11, [x10] +; SDAG-NEXT: ldp x13, x12, [x10, #16] +; SDAG-NEXT: ldp x17, x16, [x10, #32] +; SDAG-NEXT: ldp x10, x2, [x10, #48] +; SDAG-NEXT: lsr x15, x11, #1 +; SDAG-NEXT: lsr x1, x12, #1 +; SDAG-NEXT: lsl x11, x11, x8 +; SDAG-NEXT: lsl x12, x12, x8 +; SDAG-NEXT: lsr x4, x16, #1 +; SDAG-NEXT: lsr x15, x15, x14 +; SDAG-NEXT: lsl x5, x17, x8 +; SDAG-NEXT: lsr x6, x10, #1 +; SDAG-NEXT: lsr x1, x1, x14 +; SDAG-NEXT: lsl x10, x10, x8 +; SDAG-NEXT: lsr x14, x4, x14 +; SDAG-NEXT: lsl x2, x2, x8 +; SDAG-NEXT: lsl x16, x16, x8 +; SDAG-NEXT: lsr x4, x6, x3 +; SDAG-NEXT: orr x1, x5, x1 +; SDAG-NEXT: orr x10, x10, x14 +; SDAG-NEXT: lsr x14, x17, #1 +; SDAG-NEXT: orr x17, x2, x4 +; SDAG-NEXT: lsr x2, x9, #1 +; SDAG-NEXT: stp x10, x17, [x0, #48] +; SDAG-NEXT: lsr x10, x13, #1 +; SDAG-NEXT: lsr x14, x14, x3 +; SDAG-NEXT: lsl x13, x13, x8 +; SDAG-NEXT: lsl x8, x9, x8 +; SDAG-NEXT: lsr x10, x10, x3 +; SDAG-NEXT: orr x14, x16, x14 +; SDAG-NEXT: lsr x16, x2, x3 +; SDAG-NEXT: orr x13, x13, x15 +; SDAG-NEXT: stp x1, x14, [x0, #32] +; SDAG-NEXT: orr x10, x12, x10 +; SDAG-NEXT: orr x9, x11, x16 +; SDAG-NEXT: stp x13, x10, [x0, #16] +; SDAG-NEXT: stp x8, x9, [x0] +; SDAG-NEXT: add sp, sp, #128 +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_shl_i512: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: stp x28, x27, [sp, #-80]! 
; 16-byte Folded Spill +; GISEL-NEXT: stp x26, x25, [sp, #16] ; 16-byte Folded Spill +; GISEL-NEXT: stp x24, x23, [sp, #32] ; 16-byte Folded Spill +; GISEL-NEXT: stp x22, x21, [sp, #48] ; 16-byte Folded Spill +; GISEL-NEXT: stp x20, x19, [sp, #64] ; 16-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 80 +; GISEL-NEXT: .cfi_offset w19, -8 +; GISEL-NEXT: .cfi_offset w20, -16 +; GISEL-NEXT: .cfi_offset w21, -24 +; GISEL-NEXT: .cfi_offset w22, -32 +; GISEL-NEXT: .cfi_offset w23, -40 +; GISEL-NEXT: .cfi_offset w24, -48 +; GISEL-NEXT: .cfi_offset w25, -56 +; GISEL-NEXT: .cfi_offset w26, -64 +; GISEL-NEXT: .cfi_offset w27, -72 +; GISEL-NEXT: .cfi_offset w28, -80 +; GISEL-NEXT: ldp x11, x15, [x1] +; GISEL-NEXT: mov w8, w2 +; GISEL-NEXT: lsr x9, x8, #6 +; GISEL-NEXT: and x14, x8, #0x3f +; GISEL-NEXT: mov w13, #64 ; =0x40 +; GISEL-NEXT: sub x16, x13, x14 +; GISEL-NEXT: ldp x3, x6, [x1, #16] +; GISEL-NEXT: lsl x10, x11, x14 +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: ldp x20, x21, [x1, #32] +; GISEL-NEXT: csel x12, x10, xzr, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: lsr x26, x21, x16 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x13, xzr, x12, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: lsr x12, x11, x16 +; GISEL-NEXT: csel x13, x11, x13, eq +; GISEL-NEXT: lsl x11, x15, x14 +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x17, xzr, x12, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x17, x11, x17 +; GISEL-NEXT: csel x17, x17, xzr, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: csel x17, x10, x17, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: csel x17, xzr, x17, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: csel x17, xzr, x17, 
eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: csel x17, xzr, x17, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x17, xzr, x17, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x17, xzr, x17, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x2, xzr, x17, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: lsr x17, x15, x16 +; GISEL-NEXT: csel x15, x15, x2, eq +; GISEL-NEXT: lsl x2, x3, x14 +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x4, xzr, x17, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x4, x2, x4 +; GISEL-NEXT: csel x4, x4, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x5, xzr, x12, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x5, x11, x5 +; GISEL-NEXT: csel x4, x5, x4, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: csel x4, x10, x4, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: csel x4, xzr, x4, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: csel x4, xzr, x4, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x4, xzr, x4, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x4, xzr, x4, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x5, xzr, x4, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: lsr x4, x3, x16 +; GISEL-NEXT: csel x3, x3, x5, eq +; GISEL-NEXT: lsl x5, x6, x14 +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x7, xzr, x4, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x7, x5, x7 +; GISEL-NEXT: csel x7, x7, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x17, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x19, x2, x19 +; GISEL-NEXT: csel x7, x19, x7, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x12, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x19, x11, x19 +; GISEL-NEXT: csel x7, x19, x7, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: csel x7, x10, x7, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: csel x7, xzr, x7, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x7, xzr, x7, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x7, xzr, x7, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x19, xzr, x7, eq +; 
GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: lsr x7, x6, x16 +; GISEL-NEXT: csel x6, x6, x19, eq +; GISEL-NEXT: lsl x19, x20, x14 +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x22, xzr, x7, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x22, x19, x22 +; GISEL-NEXT: csel x22, x22, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x23, xzr, x4, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x23, x5, x23 +; GISEL-NEXT: csel x22, x23, x22, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x23, xzr, x17, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x23, x2, x23 +; GISEL-NEXT: csel x22, x23, x22, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x23, xzr, x12, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x23, x11, x23 +; GISEL-NEXT: csel x22, x23, x22, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: csel x22, x10, x22, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x22, xzr, x22, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x22, xzr, x22, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x23, xzr, x22, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: lsr x22, x20, x16 +; GISEL-NEXT: csel x20, x20, x23, eq +; GISEL-NEXT: lsl x23, x21, x14 +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x24, xzr, x22, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x24, x23, x24 +; GISEL-NEXT: csel x24, x24, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x25, xzr, x7, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x25, x19, x25 +; GISEL-NEXT: csel x24, x25, x24, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x25, xzr, x4, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x25, x5, x25 +; GISEL-NEXT: csel x24, x25, x24, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x25, xzr, x17, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x25, x2, x25 +; GISEL-NEXT: csel x24, x25, x24, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x25, xzr, x12, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x25, x11, x25 +; GISEL-NEXT: csel x24, x25, x24, eq +; GISEL-NEXT: cmp x9, 
#5 +; GISEL-NEXT: csel x24, x10, x24, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: ldp x25, x1, [x1, #48] +; GISEL-NEXT: csel x24, xzr, x24, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: stp x13, x15, [x0] +; GISEL-NEXT: csel x24, xzr, x24, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: stp x3, x6, [x0, #16] +; GISEL-NEXT: csel x21, x21, x24, eq +; GISEL-NEXT: lsl x24, x25, x14 +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x27, xzr, x26, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: lsr x16, x25, x16 +; GISEL-NEXT: orr x27, x24, x27 +; GISEL-NEXT: lsl x14, x1, x14 +; GISEL-NEXT: stp x20, x21, [x0, #32] +; GISEL-NEXT: csel x27, x27, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x28, xzr, x22, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x28, x23, x28 +; GISEL-NEXT: csel x27, x28, x27, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x28, xzr, x7, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x28, x19, x28 +; GISEL-NEXT: csel x27, x28, x27, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x28, xzr, x4, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x28, x5, x28 +; GISEL-NEXT: csel x27, x28, x27, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x28, xzr, x17, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x28, x2, x28 +; GISEL-NEXT: csel x27, x28, x27, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x28, xzr, x12, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x28, x11, x28 +; GISEL-NEXT: csel x27, x28, x27, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x27, x10, x27, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x27, xzr, x27, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x25, x25, x27, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x16, xzr, x16, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x14, x14, x16 +; GISEL-NEXT: csel x14, x14, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x16, xzr, x26, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x16, x24, x16 +; GISEL-NEXT: csel x14, x16, x14, eq +; GISEL-NEXT: tst 
x8, #0x3f +; GISEL-NEXT: csel x16, xzr, x22, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x16, x23, x16 +; GISEL-NEXT: ldp x22, x21, [sp, #48] ; 16-byte Folded Reload +; GISEL-NEXT: csel x14, x16, x14, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x16, xzr, x7, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x16, x19, x16 +; GISEL-NEXT: ldp x20, x19, [sp, #64] ; 16-byte Folded Reload +; GISEL-NEXT: csel x14, x16, x14, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x16, xzr, x4, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x16, x5, x16 +; GISEL-NEXT: ldp x24, x23, [sp, #32] ; 16-byte Folded Reload +; GISEL-NEXT: csel x14, x16, x14, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x16, xzr, x17, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x16, x2, x16 +; GISEL-NEXT: csel x13, x16, x14, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: orr x11, x11, x12 +; GISEL-NEXT: csel x11, x11, x13, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x9, x10, x11, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x8, x1, x9, eq +; GISEL-NEXT: stp x25, x8, [x0, #48] +; GISEL-NEXT: ldp x26, x25, [sp, #16] ; 16-byte Folded Reload +; GISEL-NEXT: ldp x28, x27, [sp], #80 ; 16-byte Folded Reload +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + + %shift_ext = zext i32 %shift to i512 + %shifted = shl i512 %input_val, %shift_ext + + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_lshr_i512(ptr %result, ptr %input, i32 %shift) { +; SDAG-LABEL: test_lshr_i512: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: sub sp, sp, #128 +; SDAG-NEXT: .cfi_def_cfa_offset 128 +; SDAG-NEXT: ldp x9, x8, [x1, #48] +; SDAG-NEXT: movi.2d v0, #0000000000000000 +; SDAG-NEXT: ldp q1, q2, [x1] +; SDAG-NEXT: mvn w11, w2 +; SDAG-NEXT: ldr q3, [x1, #32] +; SDAG-NEXT: stp x9, x8, [sp, #48] +; SDAG-NEXT: mov w8, w2 +; SDAG-NEXT: lsr x10, x8, #3 +; SDAG-NEXT: stp q2, q3, [sp, #16] +; 
SDAG-NEXT: and x3, x8, #0x3f +; SDAG-NEXT: stp q0, q0, [sp, #64] +; SDAG-NEXT: eor x3, x3, #0x3f +; SDAG-NEXT: and x9, x10, #0x38 +; SDAG-NEXT: mov x10, sp +; SDAG-NEXT: stp q0, q0, [sp, #96] +; SDAG-NEXT: add x10, x10, x9 +; SDAG-NEXT: str q1, [sp] +; SDAG-NEXT: ldp x13, x16, [x10, #48] +; SDAG-NEXT: ldp x9, x14, [x10, #16] +; SDAG-NEXT: ldp x12, x17, [x10, #32] +; SDAG-NEXT: lsl x4, x16, #1 +; SDAG-NEXT: lsl x2, x13, #1 +; SDAG-NEXT: lsr x13, x13, x8 +; SDAG-NEXT: lsl x15, x9, #1 +; SDAG-NEXT: lsr x16, x16, x8 +; SDAG-NEXT: lsr x9, x9, x8 +; SDAG-NEXT: lsl x1, x12, #1 +; SDAG-NEXT: lsl x4, x4, x3 +; SDAG-NEXT: lsr x12, x12, x8 +; SDAG-NEXT: lsl x15, x15, x11 +; SDAG-NEXT: lsl x1, x1, x11 +; SDAG-NEXT: lsl x11, x2, x11 +; SDAG-NEXT: lsl x2, x17, #1 +; SDAG-NEXT: orr x13, x4, x13 +; SDAG-NEXT: ldp x10, x4, [x10] +; SDAG-NEXT: lsr x17, x17, x8 +; SDAG-NEXT: lsl x2, x2, x3 +; SDAG-NEXT: stp x13, x16, [x0, #48] +; SDAG-NEXT: lsl x16, x14, #1 +; SDAG-NEXT: lsr x14, x14, x8 +; SDAG-NEXT: lsl x13, x4, #1 +; SDAG-NEXT: orr x11, x17, x11 +; SDAG-NEXT: orr x12, x2, x12 +; SDAG-NEXT: lsl x16, x16, x3 +; SDAG-NEXT: lsr x10, x10, x8 +; SDAG-NEXT: stp x12, x11, [x0, #32] +; SDAG-NEXT: lsl x12, x13, x3 +; SDAG-NEXT: lsr x8, x4, x8 +; SDAG-NEXT: orr x11, x14, x1 +; SDAG-NEXT: orr x9, x16, x9 +; SDAG-NEXT: stp x9, x11, [x0, #16] +; SDAG-NEXT: orr x9, x12, x10 +; SDAG-NEXT: orr x8, x8, x15 +; SDAG-NEXT: stp x9, x8, [x0] +; SDAG-NEXT: add sp, sp, #128 +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_lshr_i512: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: stp x26, x25, [sp, #-64]! 
; 16-byte Folded Spill +; GISEL-NEXT: stp x24, x23, [sp, #16] ; 16-byte Folded Spill +; GISEL-NEXT: stp x22, x21, [sp, #32] ; 16-byte Folded Spill +; GISEL-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 64 +; GISEL-NEXT: .cfi_offset w19, -8 +; GISEL-NEXT: .cfi_offset w20, -16 +; GISEL-NEXT: .cfi_offset w21, -24 +; GISEL-NEXT: .cfi_offset w22, -32 +; GISEL-NEXT: .cfi_offset w23, -40 +; GISEL-NEXT: .cfi_offset w24, -48 +; GISEL-NEXT: .cfi_offset w25, -56 +; GISEL-NEXT: .cfi_offset w26, -64 +; GISEL-NEXT: mov w8, w2 +; GISEL-NEXT: ldp x13, x2, [x1] +; GISEL-NEXT: mov w9, #64 ; =0x40 +; GISEL-NEXT: and x14, x8, #0x3f +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: sub x17, x9, x14 +; GISEL-NEXT: ldp x5, x16, [x1, #16] +; GISEL-NEXT: lsl x10, x2, x17 +; GISEL-NEXT: lsr x9, x8, #6 +; GISEL-NEXT: lsr x11, x13, x14 +; GISEL-NEXT: lsr x24, x2, x14 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: lsl x23, x5, x17 +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x10, x11, x10 +; GISEL-NEXT: lsl x22, x16, x17 +; GISEL-NEXT: lsr x21, x5, x14 +; GISEL-NEXT: csel x10, x10, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsr x20, x16, x14 +; GISEL-NEXT: csel x11, xzr, x23, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x11, x24, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: ldp x15, x11, [x1, #32] +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x22, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: lsl x19, x15, x17 +; GISEL-NEXT: orr x12, x21, x12 +; GISEL-NEXT: lsl x6, x11, x17 +; GISEL-NEXT: csel x10, x12, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsr x7, x15, x14 +; GISEL-NEXT: csel x12, xzr, x19, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x12, x20, x12 +; GISEL-NEXT: csel x4, x12, x10, eq +; GISEL-NEXT: ldp x12, x10, [x1, #48] +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x1, xzr, x6, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x1, x7, x1 +; GISEL-NEXT: lsl x3, x12, x17 +; GISEL-NEXT: lsl x17, 
x10, x17 +; GISEL-NEXT: csel x1, x1, x4, eq +; GISEL-NEXT: lsr x4, x11, x14 +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x25, xzr, x3, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x25, x4, x25 +; GISEL-NEXT: csel x25, x25, x1, eq +; GISEL-NEXT: lsr x1, x12, x14 +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x26, xzr, x17, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: lsr x14, x10, x14 +; GISEL-NEXT: orr x26, x1, x26 +; GISEL-NEXT: csel x25, x26, x25, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x25, x14, x25, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x13, x13, x25, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x23, xzr, x23, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x23, x24, x23 +; GISEL-NEXT: csel x23, x23, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x24, xzr, x22, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x24, x21, x24 +; GISEL-NEXT: csel x23, x24, x23, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x24, xzr, x19, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x24, x20, x24 +; GISEL-NEXT: csel x23, x24, x23, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x24, xzr, x6, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x24, x7, x24 +; GISEL-NEXT: csel x23, x24, x23, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x24, xzr, x3, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x24, x4, x24 +; GISEL-NEXT: csel x23, x24, x23, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x24, xzr, x17, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x24, x1, x24 +; GISEL-NEXT: csel x23, x24, x23, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x23, x14, x23, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x23, xzr, x23, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x2, x2, x23, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x22, xzr, x22, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: stp x13, x2, [x0] +; GISEL-NEXT: orr x21, x21, x22 +; GISEL-NEXT: ldp x24, x23, [sp, #16] ; 16-byte Folded Reload +; GISEL-NEXT: 
csel x21, x21, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x22, xzr, x19, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x22, x20, x22 +; GISEL-NEXT: csel x21, x22, x21, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x22, xzr, x6, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x22, x7, x22 +; GISEL-NEXT: csel x21, x22, x21, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x22, xzr, x3, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x22, x4, x22 +; GISEL-NEXT: csel x21, x22, x21, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x22, xzr, x17, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x22, x1, x22 +; GISEL-NEXT: csel x21, x22, x21, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x21, x14, x21, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x21, xzr, x21, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x21, xzr, x21, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x5, x5, x21, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x19, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x19, x20, x19 +; GISEL-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload +; GISEL-NEXT: csel x19, x19, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x6, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x20, x7, x20 +; GISEL-NEXT: csel x19, x20, x19, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x3, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x20, x4, x20 +; GISEL-NEXT: csel x19, x20, x19, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x17, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x20, x1, x20 +; GISEL-NEXT: csel x19, x20, x19, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: csel x19, x14, x19, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x19, xzr, x19, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x19, xzr, x19, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x19, xzr, x19, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x16, x16, x19, eq +; GISEL-NEXT: tst x8, #0x3f +; 
GISEL-NEXT: csel x6, xzr, x6, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: stp x5, x16, [x0, #16] +; GISEL-NEXT: orr x6, x7, x6 +; GISEL-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload +; GISEL-NEXT: csel x6, x6, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x7, xzr, x3, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x7, x4, x7 +; GISEL-NEXT: csel x6, x7, x6, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x7, xzr, x17, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x7, x1, x7 +; GISEL-NEXT: csel x6, x7, x6, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: csel x6, x14, x6, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: csel x6, xzr, x6, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x6, xzr, x6, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x6, xzr, x6, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x6, xzr, x6, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x15, x15, x6, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x3, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x3, x4, x3 +; GISEL-NEXT: csel x3, x3, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x4, xzr, x17, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x4, x1, x4 +; GISEL-NEXT: csel x3, x4, x3, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: csel x3, x14, x3, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: csel x3, xzr, x3, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: csel x3, xzr, x3, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x3, xzr, x3, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x3, xzr, x3, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x3, xzr, x3, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x11, x11, x3, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x17, xzr, x17, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: stp x15, x11, [x0, #32] +; GISEL-NEXT: orr x17, x1, x17 +; GISEL-NEXT: csel x17, x17, xzr, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: csel x17, x14, x17, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: csel x17, xzr, x17, eq +; 
GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: csel x17, xzr, x17, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: csel x17, xzr, x17, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x17, xzr, x17, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x17, xzr, x17, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x17, xzr, x17, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x12, x12, x17, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: csel x14, x14, xzr, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: csel x14, xzr, x14, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: csel x14, xzr, x14, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: csel x13, xzr, x14, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x9, xzr, x13, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x8, x10, x9, eq +; GISEL-NEXT: stp x12, x8, [x0, #48] +; GISEL-NEXT: ldp x26, x25, [sp], #64 ; 16-byte Folded Reload +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shift_ext = zext i32 %shift to i512 + %shifted = lshr i512 %input_val, %shift_ext + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_ashr_i512(ptr %result, ptr %input, i32 %shift) { +; SDAG-LABEL: test_ashr_i512: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: sub sp, sp, #128 +; SDAG-NEXT: .cfi_def_cfa_offset 128 +; SDAG-NEXT: ldp x9, x8, [x1, #48] +; SDAG-NEXT: mov x11, sp +; SDAG-NEXT: ldp q0, q1, [x1] +; SDAG-NEXT: ldr q2, [x1, #32] +; SDAG-NEXT: stp x9, x8, [sp, #48] +; SDAG-NEXT: asr x9, x8, #63 +; SDAG-NEXT: mov w8, w2 +; SDAG-NEXT: lsr x10, x8, #3 +; SDAG-NEXT: stp q1, q2, [sp, #16] +; SDAG-NEXT: and x3, x8, #0x3f +; SDAG-NEXT: str q0, [sp] +; SDAG-NEXT: eor x3, x3, #0x3f +; SDAG-NEXT: and x10, x10, #0x38 +; SDAG-NEXT: stp x9, x9, [sp, #112] +; SDAG-NEXT: stp x9, x9, [sp, #96] +; SDAG-NEXT: add x10, x11, x10 +; SDAG-NEXT: mvn 
w11, w2 +; SDAG-NEXT: stp x9, x9, [sp, #80] +; SDAG-NEXT: stp x9, x9, [sp, #64] +; SDAG-NEXT: ldp x13, x16, [x10, #48] +; SDAG-NEXT: ldp x9, x14, [x10, #16] +; SDAG-NEXT: ldp x12, x17, [x10, #32] +; SDAG-NEXT: lsl x4, x16, #1 +; SDAG-NEXT: lsl x2, x13, #1 +; SDAG-NEXT: lsr x13, x13, x8 +; SDAG-NEXT: lsl x15, x9, #1 +; SDAG-NEXT: asr x16, x16, x8 +; SDAG-NEXT: lsr x9, x9, x8 +; SDAG-NEXT: lsl x1, x12, #1 +; SDAG-NEXT: lsl x4, x4, x3 +; SDAG-NEXT: lsr x12, x12, x8 +; SDAG-NEXT: lsl x15, x15, x11 +; SDAG-NEXT: lsl x1, x1, x11 +; SDAG-NEXT: lsl x11, x2, x11 +; SDAG-NEXT: lsl x2, x17, #1 +; SDAG-NEXT: orr x13, x4, x13 +; SDAG-NEXT: ldp x10, x4, [x10] +; SDAG-NEXT: lsr x17, x17, x8 +; SDAG-NEXT: lsl x2, x2, x3 +; SDAG-NEXT: stp x13, x16, [x0, #48] +; SDAG-NEXT: lsl x16, x14, #1 +; SDAG-NEXT: lsr x14, x14, x8 +; SDAG-NEXT: lsl x13, x4, #1 +; SDAG-NEXT: orr x11, x17, x11 +; SDAG-NEXT: orr x12, x2, x12 +; SDAG-NEXT: lsl x16, x16, x3 +; SDAG-NEXT: lsr x10, x10, x8 +; SDAG-NEXT: stp x12, x11, [x0, #32] +; SDAG-NEXT: lsl x12, x13, x3 +; SDAG-NEXT: lsr x8, x4, x8 +; SDAG-NEXT: orr x11, x14, x1 +; SDAG-NEXT: orr x9, x16, x9 +; SDAG-NEXT: stp x9, x11, [x0, #16] +; SDAG-NEXT: orr x9, x12, x10 +; SDAG-NEXT: orr x8, x8, x15 +; SDAG-NEXT: stp x9, x8, [x0] +; SDAG-NEXT: add sp, sp, #128 +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_ashr_i512: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: stp x28, x27, [sp, #-80]! 
; 16-byte Folded Spill +; GISEL-NEXT: stp x26, x25, [sp, #16] ; 16-byte Folded Spill +; GISEL-NEXT: stp x24, x23, [sp, #32] ; 16-byte Folded Spill +; GISEL-NEXT: stp x22, x21, [sp, #48] ; 16-byte Folded Spill +; GISEL-NEXT: stp x20, x19, [sp, #64] ; 16-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 80 +; GISEL-NEXT: .cfi_offset w19, -8 +; GISEL-NEXT: .cfi_offset w20, -16 +; GISEL-NEXT: .cfi_offset w21, -24 +; GISEL-NEXT: .cfi_offset w22, -32 +; GISEL-NEXT: .cfi_offset w23, -40 +; GISEL-NEXT: .cfi_offset w24, -48 +; GISEL-NEXT: .cfi_offset w25, -56 +; GISEL-NEXT: .cfi_offset w26, -64 +; GISEL-NEXT: .cfi_offset w27, -72 +; GISEL-NEXT: .cfi_offset w28, -80 +; GISEL-NEXT: mov w8, w2 +; GISEL-NEXT: ldp x14, x4, [x1] +; GISEL-NEXT: mov w9, #64 ; =0x40 +; GISEL-NEXT: and x16, x8, #0x3f +; GISEL-NEXT: lsr x10, x8, #6 +; GISEL-NEXT: sub x15, x9, x16 +; GISEL-NEXT: ldr x9, [x1, #56] +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsl x12, x4, x15 +; GISEL-NEXT: ldp x7, x3, [x1, #16] +; GISEL-NEXT: lsr x13, x14, x16 +; GISEL-NEXT: asr x11, x9, #63 +; GISEL-NEXT: lsr x26, x4, x16 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: lsl x25, x7, x15 +; GISEL-NEXT: orr x12, x13, x12 +; GISEL-NEXT: lsl x23, x3, x15 +; GISEL-NEXT: csel x12, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsr x24, x7, x16 +; GISEL-NEXT: csel x13, xzr, x25, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: lsr x22, x3, x16 +; GISEL-NEXT: orr x13, x26, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: ldp x17, x13, [x1, #32] +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x2, xzr, x23, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: orr x2, x24, x2 +; GISEL-NEXT: lsl x21, x17, x15 +; GISEL-NEXT: lsl x19, x13, x15 +; GISEL-NEXT: csel x2, x2, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: ldr x12, [x1, #48] +; GISEL-NEXT: csel x1, xzr, x21, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: lsr x20, x17, x16 +; GISEL-NEXT: orr x1, x22, x1 +; GISEL-NEXT: lsl x5, x12, 
x15 +; GISEL-NEXT: lsr x6, x13, x16 +; GISEL-NEXT: csel x1, x1, x2, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x2, xzr, x19, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: orr x2, x20, x2 +; GISEL-NEXT: csel x2, x2, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x1, xzr, x5, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: orr x27, x6, x1 +; GISEL-NEXT: lsl x1, x9, x15 +; GISEL-NEXT: lsl x15, x11, x15 +; GISEL-NEXT: csel x27, x27, x2, eq +; GISEL-NEXT: lsr x2, x12, x16 +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x28, xzr, x1, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: lsr x16, x9, x16 +; GISEL-NEXT: orr x28, x2, x28 +; GISEL-NEXT: csel x27, x28, x27, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x28, xzr, x15, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: orr x28, x16, x28 +; GISEL-NEXT: csel x27, x28, x27, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x14, x14, x27, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x25, xzr, x25, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: orr x25, x26, x25 +; GISEL-NEXT: csel x25, x25, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x26, xzr, x23, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: orr x26, x24, x26 +; GISEL-NEXT: csel x25, x26, x25, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x26, xzr, x21, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: orr x26, x22, x26 +; GISEL-NEXT: csel x25, x26, x25, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x26, xzr, x19, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: orr x26, x20, x26 +; GISEL-NEXT: csel x25, x26, x25, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x26, xzr, x5, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: orr x26, x6, x26 +; GISEL-NEXT: csel x25, x26, x25, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x26, xzr, x1, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: orr x26, x2, x26 +; GISEL-NEXT: csel x25, x26, x25, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x26, xzr, x15, eq +; GISEL-NEXT: cmp x10, #6 +; 
GISEL-NEXT: orr x26, x16, x26 +; GISEL-NEXT: csel x25, x26, x25, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: csel x25, x11, x25, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x4, x4, x25, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x23, xzr, x23, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: stp x14, x4, [x0] +; GISEL-NEXT: orr x23, x24, x23 +; GISEL-NEXT: ldp x26, x25, [sp, #16] ; 16-byte Folded Reload +; GISEL-NEXT: csel x23, x23, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x24, xzr, x21, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: orr x24, x22, x24 +; GISEL-NEXT: csel x23, x24, x23, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x24, xzr, x19, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: orr x24, x20, x24 +; GISEL-NEXT: csel x23, x24, x23, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x24, xzr, x5, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: orr x24, x6, x24 +; GISEL-NEXT: csel x23, x24, x23, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x24, xzr, x1, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: orr x24, x2, x24 +; GISEL-NEXT: csel x23, x24, x23, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x24, xzr, x15, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: orr x24, x16, x24 +; GISEL-NEXT: csel x23, x24, x23, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: csel x23, x11, x23, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: csel x23, x11, x23, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x7, x7, x23, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x21, xzr, x21, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: orr x21, x22, x21 +; GISEL-NEXT: ldp x24, x23, [sp, #32] ; 16-byte Folded Reload +; GISEL-NEXT: csel x21, x21, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x22, xzr, x19, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: orr x22, x20, x22 +; GISEL-NEXT: csel x21, x22, x21, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x22, xzr, x5, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: orr x22, x6, x22 +; 
GISEL-NEXT: csel x21, x22, x21, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x22, xzr, x1, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: orr x22, x2, x22 +; GISEL-NEXT: csel x21, x22, x21, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x22, xzr, x15, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: orr x22, x16, x22 +; GISEL-NEXT: csel x21, x22, x21, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: csel x21, x11, x21, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: csel x21, x11, x21, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: csel x21, x11, x21, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x3, x3, x21, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x19, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: stp x7, x3, [x0, #16] +; GISEL-NEXT: orr x19, x20, x19 +; GISEL-NEXT: ldp x22, x21, [sp, #48] ; 16-byte Folded Reload +; GISEL-NEXT: csel x19, x19, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x5, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: orr x20, x6, x20 +; GISEL-NEXT: csel x19, x20, x19, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x1, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: orr x20, x2, x20 +; GISEL-NEXT: csel x19, x20, x19, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x15, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: orr x20, x16, x20 +; GISEL-NEXT: csel x19, x20, x19, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: csel x19, x11, x19, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: csel x19, x11, x19, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: csel x19, x11, x19, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: csel x19, x11, x19, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x17, x17, x19, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x5, xzr, x5, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: orr x5, x6, x5 +; GISEL-NEXT: ldp x20, x19, [sp, #64] ; 16-byte Folded Reload +; GISEL-NEXT: csel x5, x5, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x6, xzr, x1, eq +; 
GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: orr x6, x2, x6 +; GISEL-NEXT: csel x5, x6, x5, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x6, xzr, x15, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: orr x6, x16, x6 +; GISEL-NEXT: csel x5, x6, x5, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: csel x5, x11, x5, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: csel x5, x11, x5, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: csel x5, x11, x5, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: csel x5, x11, x5, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: csel x5, x11, x5, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x13, x13, x5, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: stp x17, x13, [x0, #32] +; GISEL-NEXT: orr x1, x2, x1 +; GISEL-NEXT: csel x1, x1, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x2, xzr, x15, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: orr x2, x16, x2 +; GISEL-NEXT: csel x1, x2, x1, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: csel x1, x11, x1, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: csel x1, x11, x1, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: csel x1, x11, x1, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: csel x1, x11, x1, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: csel x1, x11, x1, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: csel x1, x11, x1, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x12, x12, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x15, xzr, x15, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: orr x15, x16, x15 +; GISEL-NEXT: csel x15, x15, x11, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: csel x15, x11, x15, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: csel x15, x11, x15, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: csel x14, x11, x15, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: csel x14, x11, x14, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: csel x14, x11, x14, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: csel x14, x11, x14, eq +; GISEL-NEXT: cmp 
x10, #7 +; GISEL-NEXT: csel x10, x11, x14, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x8, x9, x10, eq +; GISEL-NEXT: stp x12, x8, [x0, #48] +; GISEL-NEXT: ldp x28, x27, [sp], #80 ; 16-byte Folded Reload +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shift_ext = zext i32 %shift to i512 + %shifted = ashr i512 %input_val, %shift_ext + store i512 %shifted, ptr %result, align 64 + ret void +} + +; i1024 shift functions +define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { +; SDAG-LABEL: test_shl_i1024: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: sub sp, sp, #352 +; SDAG-NEXT: stp x28, x27, [sp, #256] ; 16-byte Folded Spill +; SDAG-NEXT: stp x26, x25, [sp, #272] ; 16-byte Folded Spill +; SDAG-NEXT: stp x24, x23, [sp, #288] ; 16-byte Folded Spill +; SDAG-NEXT: stp x22, x21, [sp, #304] ; 16-byte Folded Spill +; SDAG-NEXT: stp x20, x19, [sp, #320] ; 16-byte Folded Spill +; SDAG-NEXT: stp x29, x30, [sp, #336] ; 16-byte Folded Spill +; SDAG-NEXT: .cfi_def_cfa_offset 352 +; SDAG-NEXT: .cfi_offset w30, -8 +; SDAG-NEXT: .cfi_offset w29, -16 +; SDAG-NEXT: .cfi_offset w19, -24 +; SDAG-NEXT: .cfi_offset w20, -32 +; SDAG-NEXT: .cfi_offset w21, -40 +; SDAG-NEXT: .cfi_offset w22, -48 +; SDAG-NEXT: .cfi_offset w23, -56 +; SDAG-NEXT: .cfi_offset w24, -64 +; SDAG-NEXT: .cfi_offset w25, -72 +; SDAG-NEXT: .cfi_offset w26, -80 +; SDAG-NEXT: .cfi_offset w27, -88 +; SDAG-NEXT: .cfi_offset w28, -96 +; SDAG-NEXT: ldp x8, x9, [x1, #112] +; SDAG-NEXT: movi.2d v0, #0000000000000000 +; SDAG-NEXT: ldp q1, q2, [x1] +; SDAG-NEXT: mov x10, sp +; SDAG-NEXT: ldp q3, q4, [x1, #32] +; SDAG-NEXT: add x10, x10, #128 +; SDAG-NEXT: ldp q5, q6, [x1, #64] +; SDAG-NEXT: mvn w4, w2 +; SDAG-NEXT: ldr q7, [x1, #96] +; SDAG-NEXT: stp x8, x9, [sp, #240] +; SDAG-NEXT: mov w8, w2 +; SDAG-NEXT: lsr x9, x8, #3 +; SDAG-NEXT: stp q0, q0, [sp] +; SDAG-NEXT: stp q0, q0, [sp, #32] +; SDAG-NEXT: ldp x29, x30, [sp, #336] ; 16-byte Folded Reload +; SDAG-NEXT: and x9, x9, #0x78 +; 
SDAG-NEXT: stp q0, q0, [sp, #64] +; SDAG-NEXT: stp q0, q0, [sp, #96] +; SDAG-NEXT: sub x1, x10, x9 +; SDAG-NEXT: and x10, x8, #0x3f +; SDAG-NEXT: stp q2, q3, [sp, #144] +; SDAG-NEXT: eor x10, x10, #0x3f +; SDAG-NEXT: stp q4, q5, [sp, #176] +; SDAG-NEXT: stp q6, q7, [sp, #208] +; SDAG-NEXT: str q1, [sp, #128] +; SDAG-NEXT: ldp x6, x19, [x1, #64] +; SDAG-NEXT: ldr x26, [x1, #96] +; SDAG-NEXT: ldp x22, x23, [x1, #80] +; SDAG-NEXT: ldp x27, x24, [x1, #104] +; SDAG-NEXT: lsr x20, x6, #1 +; SDAG-NEXT: lsr x21, x19, #1 +; SDAG-NEXT: lsl x19, x19, x8 +; SDAG-NEXT: ldp x9, x13, [x1] +; SDAG-NEXT: lsl x25, x22, x8 +; SDAG-NEXT: lsr x20, x20, x10 +; SDAG-NEXT: ldp x11, x14, [x1, #16] +; SDAG-NEXT: ldp x12, x15, [x1, #32] +; SDAG-NEXT: lsr x21, x21, x4 +; SDAG-NEXT: ldp x17, x2, [x1, #48] +; SDAG-NEXT: orr x19, x19, x20 +; SDAG-NEXT: ldr x1, [x1, #120] +; SDAG-NEXT: lsr x20, x24, #1 +; SDAG-NEXT: lsr x16, x13, #1 +; SDAG-NEXT: lsr x3, x14, #1 +; SDAG-NEXT: lsr x5, x15, #1 +; SDAG-NEXT: orr x21, x25, x21 +; SDAG-NEXT: lsr x7, x2, #1 +; SDAG-NEXT: lsr x25, x23, #1 +; SDAG-NEXT: lsr x28, x27, #1 +; SDAG-NEXT: lsl x1, x1, x8 +; SDAG-NEXT: lsr x20, x20, x10 +; SDAG-NEXT: lsr x16, x16, x4 +; SDAG-NEXT: lsr x3, x3, x4 +; SDAG-NEXT: lsr x5, x5, x4 +; SDAG-NEXT: lsr x7, x7, x4 +; SDAG-NEXT: lsr x22, x22, #1 +; SDAG-NEXT: lsr x25, x25, x4 +; SDAG-NEXT: lsr x4, x28, x4 +; SDAG-NEXT: orr x1, x1, x20 +; SDAG-NEXT: lsl x20, x23, x8 +; SDAG-NEXT: lsl x23, x24, x8 +; SDAG-NEXT: lsr x28, x26, #1 +; SDAG-NEXT: lsr x22, x22, x10 +; SDAG-NEXT: lsl x24, x27, x8 +; SDAG-NEXT: orr x4, x23, x4 +; SDAG-NEXT: lsl x6, x6, x8 +; SDAG-NEXT: lsl x2, x2, x8 +; SDAG-NEXT: lsr x27, x28, x10 +; SDAG-NEXT: stp x4, x1, [x0, #112] +; SDAG-NEXT: lsl x1, x26, x8 +; SDAG-NEXT: orr x20, x20, x22 +; SDAG-NEXT: lsr x4, x9, #1 +; SDAG-NEXT: lsl x13, x13, x8 +; SDAG-NEXT: orr x22, x24, x27 +; SDAG-NEXT: orr x1, x1, x25 +; SDAG-NEXT: stp x21, x20, [x0, #80] +; SDAG-NEXT: lsr x20, x17, #1 +; SDAG-NEXT: stp x1, x22, [x0, 
#96] +; SDAG-NEXT: lsr x1, x11, #1 +; SDAG-NEXT: lsr x21, x12, #1 +; SDAG-NEXT: lsl x14, x14, x8 +; SDAG-NEXT: lsl x15, x15, x8 +; SDAG-NEXT: lsr x20, x20, x10 +; SDAG-NEXT: lsl x17, x17, x8 +; SDAG-NEXT: orr x6, x6, x7 +; SDAG-NEXT: lsr x7, x21, x10 +; SDAG-NEXT: lsl x12, x12, x8 +; SDAG-NEXT: lsr x1, x1, x10 +; SDAG-NEXT: lsl x11, x11, x8 +; SDAG-NEXT: lsr x10, x4, x10 +; SDAG-NEXT: stp x6, x19, [x0, #64] +; SDAG-NEXT: orr x2, x2, x20 +; SDAG-NEXT: lsl x8, x9, x8 +; SDAG-NEXT: orr x17, x17, x5 +; SDAG-NEXT: ldp x20, x19, [sp, #320] ; 16-byte Folded Reload +; SDAG-NEXT: orr x15, x15, x7 +; SDAG-NEXT: ldp x22, x21, [sp, #304] ; 16-byte Folded Reload +; SDAG-NEXT: orr x12, x12, x3 +; SDAG-NEXT: ldp x24, x23, [sp, #288] ; 16-byte Folded Reload +; SDAG-NEXT: orr x14, x14, x1 +; SDAG-NEXT: ldp x26, x25, [sp, #272] ; 16-byte Folded Reload +; SDAG-NEXT: orr x11, x11, x16 +; SDAG-NEXT: ldp x28, x27, [sp, #256] ; 16-byte Folded Reload +; SDAG-NEXT: orr x9, x13, x10 +; SDAG-NEXT: stp x17, x2, [x0, #48] +; SDAG-NEXT: stp x12, x15, [x0, #32] +; SDAG-NEXT: stp x11, x14, [x0, #16] +; SDAG-NEXT: stp x8, x9, [x0] +; SDAG-NEXT: add sp, sp, #352 +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_shl_i1024: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: sub sp, sp, #416 +; GISEL-NEXT: stp x28, x27, [sp, #320] ; 16-byte Folded Spill +; GISEL-NEXT: stp x26, x25, [sp, #336] ; 16-byte Folded Spill +; GISEL-NEXT: stp x24, x23, [sp, #352] ; 16-byte Folded Spill +; GISEL-NEXT: stp x22, x21, [sp, #368] ; 16-byte Folded Spill +; GISEL-NEXT: stp x20, x19, [sp, #384] ; 16-byte Folded Spill +; GISEL-NEXT: stp x29, x30, [sp, #400] ; 16-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 416 +; GISEL-NEXT: .cfi_offset w30, -8 +; GISEL-NEXT: .cfi_offset w29, -16 +; GISEL-NEXT: .cfi_offset w19, -24 +; GISEL-NEXT: .cfi_offset w20, -32 +; GISEL-NEXT: .cfi_offset w21, -40 +; GISEL-NEXT: .cfi_offset w22, -48 +; GISEL-NEXT: .cfi_offset w23, -56 +; GISEL-NEXT: .cfi_offset w24, -64 +; GISEL-NEXT: .cfi_offset w25, -72 
+; GISEL-NEXT: .cfi_offset w26, -80 +; GISEL-NEXT: .cfi_offset w27, -88 +; GISEL-NEXT: .cfi_offset w28, -96 +; GISEL-NEXT: ldp x10, x11, [x1] +; GISEL-NEXT: mov w8, w2 +; GISEL-NEXT: lsr x9, x8, #6 +; GISEL-NEXT: and x16, x8, #0x3f +; GISEL-NEXT: mov w13, #64 ; =0x40 +; GISEL-NEXT: sub x21, x13, x16 +; GISEL-NEXT: str x0, [sp, #112] ; 8-byte Folded Spill +; GISEL-NEXT: mov x24, x16 +; GISEL-NEXT: lsl x25, x10, x16 +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: lsr x26, x10, x21 +; GISEL-NEXT: lsl x2, x11, x16 +; GISEL-NEXT: lsr x23, x11, x21 +; GISEL-NEXT: mov x22, x21 +; GISEL-NEXT: csel x12, x25, xzr, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: str x1, [sp, #312] ; 8-byte Folded Spill +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: str x23, [sp, #208] ; 8-byte Folded Spill +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: stp x24, x22, [sp, #40] ; 16-byte Folded Spill +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x10, x10, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x10, [sp, #192] ; 8-byte Folded Spill +; GISEL-NEXT: csel x10, xzr, x26, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x10, x2, x10 +; 
GISEL-NEXT: csel x10, x10, xzr, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: csel x10, x25, x10, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x13, xzr, x10, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: ldp x12, x10, [x1, #16] +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: lsl x20, x12, x16 +; GISEL-NEXT: csel x11, x11, x13, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x11, [sp, #184] ; 8-byte Folded Spill +; GISEL-NEXT: csel x11, xzr, x23, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x11, x20, x11 +; GISEL-NEXT: lsr x15, x12, x21 +; GISEL-NEXT: lsl x14, x10, x16 +; GISEL-NEXT: csel x11, x11, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsr x17, x10, x21 +; GISEL-NEXT: csel x13, xzr, x26, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: str x20, [sp, #8] ; 8-byte Folded Spill +; GISEL-NEXT: orr x13, x2, x13 +; GISEL-NEXT: csel x11, x13, x11, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: csel x11, x25, x11, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x11, xzr, x11, eq +; 
GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x11, [sp, #176] ; 8-byte Folded Spill +; GISEL-NEXT: csel x11, xzr, x15, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x11, x14, x11 +; GISEL-NEXT: csel x11, x11, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x23, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x12, x20, x12 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x26, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x12, x2, x12 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: csel x11, x25, x11, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x13, xzr, x11, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: ldp x12, x11, [x1, #32] +; GISEL-NEXT: csel x13, xzr, x13, 
eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: lsl x0, x12, x16 +; GISEL-NEXT: csel x10, x10, x13, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x10, [sp, #168] ; 8-byte Folded Spill +; GISEL-NEXT: csel x10, xzr, x17, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x10, x0, x10 +; GISEL-NEXT: lsr x27, x12, x21 +; GISEL-NEXT: lsl x19, x11, x16 +; GISEL-NEXT: csel x10, x10, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsr x3, x11, x21 +; GISEL-NEXT: csel x13, xzr, x15, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: stp x27, x0, [sp, #240] ; 16-byte Folded Spill +; GISEL-NEXT: orr x13, x14, x13 +; GISEL-NEXT: mov x7, x3 +; GISEL-NEXT: csel x10, x13, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x23, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x13, x20, x13 +; GISEL-NEXT: csel x10, x13, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x26, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x13, x2, x13 +; GISEL-NEXT: csel x10, x13, x10, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: csel x10, x25, x10, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x10, x12, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x10, [sp, #160] ; 8-byte Folded Spill +; GISEL-NEXT: 
csel x10, xzr, x27, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x10, x19, x10 +; GISEL-NEXT: csel x10, x10, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x17, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x12, x0, x12 +; GISEL-NEXT: csel x10, x12, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x15, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x12, x14, x12 +; GISEL-NEXT: csel x10, x12, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x23, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x12, x20, x12 +; GISEL-NEXT: csel x10, x12, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x26, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x12, x2, x12 +; GISEL-NEXT: csel x10, x12, x10, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x10, x25, x10, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x13, xzr, x10, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: ldp x12, x10, [x1, #48] +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: lsl x4, x12, x16 +; GISEL-NEXT: csel x11, x11, x13, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x11, [sp, #152] ; 8-byte Folded Spill +; GISEL-NEXT: csel x11, xzr, x3, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x11, x4, x11 +; GISEL-NEXT: lsl x30, x10, x16 +; GISEL-NEXT: lsr x28, x10, x21 +; GISEL-NEXT: csel x11, x11, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x27, eq +; GISEL-NEXT: cmp x9, #1 +; 
GISEL-NEXT: str x30, [sp, #200] ; 8-byte Folded Spill +; GISEL-NEXT: orr x13, x19, x13 +; GISEL-NEXT: csel x11, x13, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x17, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x13, x0, x13 +; GISEL-NEXT: csel x11, x13, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x15, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x13, x14, x13 +; GISEL-NEXT: csel x11, x13, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x23, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x13, x20, x13 +; GISEL-NEXT: csel x11, x13, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x26, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x13, x2, x13 +; GISEL-NEXT: csel x11, x13, x11, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: lsr x13, x12, x21 +; GISEL-NEXT: csel x11, x25, x11, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: mov x6, x13 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: str x6, [sp, #256] ; 8-byte Folded Spill +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x11, [sp, #144] ; 8-byte Folded Spill +; GISEL-NEXT: csel x11, xzr, x13, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x11, x30, x11 +; GISEL-NEXT: csel x11, x11, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x3, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x12, x4, x12 +; GISEL-NEXT: csel x11, x12, x11, eq +; 
GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x27, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x12, x19, x12 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x17, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x12, x0, x12 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x15, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x12, x14, x12 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x23, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x12, x20, x12 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x26, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: orr x12, x2, x12 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x11, x25, x11, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x12, xzr, x11, eq +; GISEL-NEXT: ldp x11, x5, [x1, #64] +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x12, x10, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsl x21, x11, x16 +; GISEL-NEXT: str x12, [sp, #136] ; 8-byte Folded Spill +; GISEL-NEXT: csel x12, xzr, x28, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x12, x21, x12 +; GISEL-NEXT: lsr x10, x11, x22 +; GISEL-NEXT: mov x16, x19 +; GISEL-NEXT: csel x12, x12, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: mov x1, x16 +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: str x16, [sp, #304] ; 8-byte Folded Spill +; 
GISEL-NEXT: orr x13, x30, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x3, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: lsl x3, x5, x24 +; GISEL-NEXT: orr x13, x4, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: stp x21, x3, [sp, #216] ; 16-byte Folded Spill +; GISEL-NEXT: csel x13, xzr, x27, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x13, x19, x13 +; GISEL-NEXT: mov x19, x28 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x17, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x13, x0, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x15, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x13, x14, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x23, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: orr x13, x20, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x26, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: orr x13, x2, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: csel x12, x25, x12, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x11, x11, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x11, [sp, #128] ; 8-byte Folded Spill +; GISEL-NEXT: csel x11, xzr, x10, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x11, x3, x11 +; GISEL-NEXT: csel x11, x11, xzr, eq +; GISEL-NEXT: tst x8, #0x3f 
+; GISEL-NEXT: csel x12, xzr, x28, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: mov x28, x4 +; GISEL-NEXT: orr x12, x21, x12 +; GISEL-NEXT: str x28, [sp, #32] ; 8-byte Folded Spill +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x6, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x12, x30, x12 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x7, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x12, x4, x12 +; GISEL-NEXT: mov x4, x20 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x27, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: mov x27, x2 +; GISEL-NEXT: orr x12, x16, x12 +; GISEL-NEXT: mov x16, x17 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x17, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: mov x17, x15 +; GISEL-NEXT: orr x12, x0, x12 +; GISEL-NEXT: lsr x0, x5, x22 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x15, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: ldr x15, [sp, #312] ; 8-byte Folded Reload +; GISEL-NEXT: orr x12, x14, x12 +; GISEL-NEXT: str x0, [sp, #280] ; 8-byte Folded Spill +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x23, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: mov x23, x25 +; GISEL-NEXT: orr x12, x20, x12 +; GISEL-NEXT: str x23, [sp, #288] ; 8-byte Folded Spill +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x26, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: orr x12, x2, x12 +; GISEL-NEXT: mov x2, x3 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: csel x11, x25, x11, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: mov x25, x26 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel 
x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x12, xzr, x11, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x12, x5, x12, eq +; GISEL-NEXT: ldp x11, x5, [x15, #80] +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x12, [sp, #120] ; 8-byte Folded Spill +; GISEL-NEXT: mov x15, x7 +; GISEL-NEXT: csel x12, xzr, x0, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: str x15, [sp, #24] ; 8-byte Folded Spill +; GISEL-NEXT: lsl x20, x11, x24 +; GISEL-NEXT: orr x12, x20, x12 +; GISEL-NEXT: str x20, [sp, #232] ; 8-byte Folded Spill +; GISEL-NEXT: csel x12, x12, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x10, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x13, x3, x13 +; GISEL-NEXT: lsl x3, x5, x24 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x19, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: stp x19, x3, [sp, #264] ; 16-byte Folded Spill +; GISEL-NEXT: orr x13, x21, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x6, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x13, x30, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x7, eq +; GISEL-NEXT: ldp x7, x30, [sp, #240] ; 16-byte Folded Reload +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x13, x28, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x7, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x13, x1, x13 +; GISEL-NEXT: mov x1, x14 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x16, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: orr x13, x30, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x17, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: orr x13, x14, x13 +; 
GISEL-NEXT: ldr x14, [sp, #208] ; 8-byte Folded Reload +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x14, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: orr x13, x4, x13 +; GISEL-NEXT: mov x4, x10 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x26, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: mov x26, x27 +; GISEL-NEXT: orr x13, x27, x13 +; GISEL-NEXT: lsr x27, x11, x22 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: mov x13, x23 +; GISEL-NEXT: csel x12, x23, x12, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: str x27, [sp, #64] ; 8-byte Folded Spill +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: mov x23, x20 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x11, x11, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x11, [sp, #104] ; 8-byte Folded Spill +; GISEL-NEXT: csel x11, xzr, x27, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x11, x3, x11 +; GISEL-NEXT: csel x11, x11, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x0, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: mov x0, x7 +; GISEL-NEXT: orr x12, x20, x12 +; GISEL-NEXT: mov x20, x16 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x10, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: ldr x10, [sp, #312] ; 8-byte Folded Reload +; GISEL-NEXT: orr x12, x2, x12 +; GISEL-NEXT: ldr x2, [sp, #304] ; 8-byte Folded Reload +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x19, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x12, x21, x12 +; GISEL-NEXT: ldr x21, [sp, #200] ; 8-byte Folded Reload +; GISEL-NEXT: csel x11, 
x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x6, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x12, x21, x12 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x15, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x12, x28, x12 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x7, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: mov x7, x17 +; GISEL-NEXT: orr x12, x2, x12 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x16, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: orr x12, x30, x12 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x17, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: mov x17, x24 +; GISEL-NEXT: orr x12, x1, x12 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x14, eq +; GISEL-NEXT: ldr x14, [sp, #8] ; 8-byte Folded Reload +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: orr x12, x14, x12 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x25, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: orr x12, x26, x12 +; GISEL-NEXT: csel x11, x12, x11, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x11, x13, x11, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x12, xzr, x11, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: ldp x11, x10, [x10, #96] +; GISEL-NEXT: csel x12, x5, x12, eq +; GISEL-NEXT: str x12, [sp, #96] ; 8-byte Folded Spill +; GISEL-NEXT: mov x12, x22 +; GISEL-NEXT: lsr x22, x5, x22 +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: mov x5, x27 +; GISEL-NEXT: lsl x24, x11, x24 +; GISEL-NEXT: str x10, [sp, #296] ; 8-byte Folded Spill +; GISEL-NEXT: csel x10, xzr, x22, eq +; 
GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: str x22, [sp, #16] ; 8-byte Folded Spill +; GISEL-NEXT: orr x10, x24, x10 +; GISEL-NEXT: csel x10, x10, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x27, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: ldr x27, [sp, #280] ; 8-byte Folded Reload +; GISEL-NEXT: orr x13, x3, x13 +; GISEL-NEXT: mov x3, x26 +; GISEL-NEXT: csel x10, x13, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x27, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x13, x23, x13 +; GISEL-NEXT: mov x23, x4 +; GISEL-NEXT: csel x10, x13, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x4, eq +; GISEL-NEXT: ldp x4, x16, [sp, #216] ; 16-byte Folded Reload +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x13, x16, x13 +; GISEL-NEXT: csel x10, x13, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x19, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: mov x19, x1 +; GISEL-NEXT: orr x13, x4, x13 +; GISEL-NEXT: csel x10, x13, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x6, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: mov x6, x14 +; GISEL-NEXT: orr x13, x21, x13 +; GISEL-NEXT: csel x10, x13, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x15, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: orr x13, x28, x13 +; GISEL-NEXT: csel x10, x13, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x0, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: mov x0, x23 +; GISEL-NEXT: orr x13, x2, x13 +; GISEL-NEXT: csel x10, x13, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x20, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: orr x13, x30, x13 +; GISEL-NEXT: ldr x30, [sp, #208] ; 8-byte Folded Reload +; GISEL-NEXT: csel x10, x13, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x7, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: orr x13, x1, x13 +; GISEL-NEXT: csel x10, x13, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x30, eq 
+; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: orr x13, x14, x13 +; GISEL-NEXT: ldp x14, x2, [sp, #264] ; 16-byte Folded Reload +; GISEL-NEXT: csel x10, x13, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x25, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: orr x13, x26, x13 +; GISEL-NEXT: ldr x26, [sp, #288] ; 8-byte Folded Reload +; GISEL-NEXT: csel x10, x13, x10, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: lsr x13, x11, x12 +; GISEL-NEXT: csel x10, x26, x10, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: str x13, [sp, #72] ; 8-byte Folded Spill +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x10, [sp, #88] ; 8-byte Folded Spill +; GISEL-NEXT: ldr x10, [sp, #296] ; 8-byte Folded Reload +; GISEL-NEXT: lsl x11, x10, x17 +; GISEL-NEXT: csel x10, xzr, x13, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: ldr x17, [sp, #232] ; 8-byte Folded Reload +; GISEL-NEXT: ldr x13, [sp, #256] ; 8-byte Folded Reload +; GISEL-NEXT: orr x10, x11, x10 +; GISEL-NEXT: str x11, [sp, #56] ; 8-byte Folded Spill +; GISEL-NEXT: csel x10, x10, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x22, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x11, x24, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x5, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x11, x2, x11 +; GISEL-NEXT: ldp x12, x5, [sp, #240] ; 16-byte Folded Reload +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x27, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: mov x27, x30 +; GISEL-NEXT: orr x11, x17, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x23, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: mov x23, x20 +; GISEL-NEXT: 
orr x11, x16, x11 +; GISEL-NEXT: ldr x16, [sp, #304] ; 8-byte Folded Reload +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x14, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x11, x4, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x13, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: orr x11, x21, x11 +; GISEL-NEXT: ldr x21, [sp, #296] ; 8-byte Folded Reload +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x15, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: orr x11, x28, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x12, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: orr x11, x16, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x20, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: orr x11, x5, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x7, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: orr x11, x1, x11 +; GISEL-NEXT: ldr x1, [sp, #312] ; 8-byte Folded Reload +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x30, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: orr x11, x6, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x25, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: orr x11, x3, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x10, x26, x10, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x11, xzr, x10, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x11, x21, x11, eq +; GISEL-NEXT: ldp x10, x20, [x1, #112] +; GISEL-NEXT: str x11, [sp, #80] ; 8-byte Folded Spill +; GISEL-NEXT: ldp x11, x4, [sp, #40] ; 16-byte Folded Reload +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: 
lsr x21, x21, x4 +; GISEL-NEXT: lsl x28, x10, x11 +; GISEL-NEXT: csel x1, xzr, x21, eq +; GISEL-NEXT: str x21, [sp, #296] ; 8-byte Folded Spill +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x1, x28, x1 +; GISEL-NEXT: ldr x21, [sp, #72] ; 8-byte Folded Reload +; GISEL-NEXT: str x28, [sp, #312] ; 8-byte Folded Spill +; GISEL-NEXT: csel x1, x1, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: ldr x28, [sp, #56] ; 8-byte Folded Reload +; GISEL-NEXT: csel x30, xzr, x21, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x30, x28, x30 +; GISEL-NEXT: csel x1, x30, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x30, xzr, x22, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: ldr x22, [sp, #64] ; 8-byte Folded Reload +; GISEL-NEXT: orr x30, x24, x30 +; GISEL-NEXT: csel x1, x30, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x30, xzr, x22, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x30, x2, x30 +; GISEL-NEXT: ldr x2, [sp, #280] ; 8-byte Folded Reload +; GISEL-NEXT: csel x1, x30, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x30, xzr, x2, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x30, x17, x30 +; GISEL-NEXT: ldr x17, [sp, #224] ; 8-byte Folded Reload +; GISEL-NEXT: csel x1, x30, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x30, xzr, x0, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x30, x17, x30 +; GISEL-NEXT: csel x1, x30, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x30, xzr, x14, eq +; GISEL-NEXT: ldr x14, [sp, #216] ; 8-byte Folded Reload +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: orr x30, x14, x30 +; GISEL-NEXT: csel x1, x30, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x30, xzr, x13, eq +; GISEL-NEXT: ldr x13, [sp, #200] ; 8-byte Folded Reload +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: orr x30, x13, x30 +; GISEL-NEXT: csel x1, x30, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x30, xzr, x15, eq +; GISEL-NEXT: ldr x15, [sp, #32] ; 8-byte Folded Reload +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: orr 
x30, x15, x30 +; GISEL-NEXT: csel x1, x30, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x30, xzr, x12, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: orr x30, x16, x30 +; GISEL-NEXT: csel x1, x30, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x30, xzr, x23, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: orr x30, x5, x30 +; GISEL-NEXT: csel x1, x30, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x30, xzr, x7, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: orr x30, x19, x30 +; GISEL-NEXT: csel x1, x30, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x30, xzr, x27, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: orr x30, x6, x30 +; GISEL-NEXT: csel x1, x30, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x30, xzr, x25, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: orr x30, x3, x30 +; GISEL-NEXT: csel x1, x30, x1, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: lsr x30, x10, x4 +; GISEL-NEXT: csel x1, x26, x1, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x26, x10, x1, eq +; GISEL-NEXT: lsl x10, x20, x11 +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x16, xzr, x30, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: ldr x11, [sp, #296] ; 8-byte Folded Reload +; GISEL-NEXT: orr x10, x10, x16 +; GISEL-NEXT: ldr x16, [sp, #312] ; 8-byte Folded Reload +; GISEL-NEXT: csel x10, x10, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x11, x16, x11 +; GISEL-NEXT: ldr x16, [sp, #272] ; 8-byte Folded Reload +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x21, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x11, x28, x11 +; GISEL-NEXT: ldp x29, x30, [sp, #400] ; 16-byte Folded Reload +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #3 +; 
GISEL-NEXT: orr x11, x24, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x22, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x11, x16, x11 +; GISEL-NEXT: ldr x16, [sp, #232] ; 8-byte Folded Reload +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x2, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x11, x16, x11 +; GISEL-NEXT: ldp x22, x21, [sp, #368] ; 16-byte Folded Reload +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x0, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: orr x11, x17, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: ldr x11, [sp, #264] ; 8-byte Folded Reload +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: orr x11, x14, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: ldr x11, [sp, #256] ; 8-byte Folded Reload +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: orr x11, x13, x11 +; GISEL-NEXT: ldr x13, [sp, #112] ; 8-byte Folded Reload +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: ldr x11, [sp, #24] ; 8-byte Folded Reload +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: orr x11, x15, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x12, eq +; GISEL-NEXT: ldr x12, [sp, #304] ; 8-byte Folded Reload +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: orr x11, x12, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: ldr x11, [sp, #192] ; 8-byte Folded Reload +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x11, [x13] +; GISEL-NEXT: ldp x12, x11, [sp, #176] ; 16-byte Folded Reload +; GISEL-NEXT: stp x11, x12, [x13, #8] +; GISEL-NEXT: csel x11, xzr, x23, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: orr x11, x5, x11 +; GISEL-NEXT: ldp x24, x23, [sp, #352] ; 16-byte Folded 
Reload +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: ldr x11, [sp, #168] ; 8-byte Folded Reload +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x11, [x13, #24] +; GISEL-NEXT: ldp x12, x11, [sp, #152] ; 16-byte Folded Reload +; GISEL-NEXT: stp x11, x12, [x13, #32] +; GISEL-NEXT: csel x11, xzr, x7, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: orr x11, x19, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: ldr x11, [sp, #144] ; 8-byte Folded Reload +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x11, [x13, #48] +; GISEL-NEXT: ldp x12, x11, [sp, #128] ; 16-byte Folded Reload +; GISEL-NEXT: stp x11, x12, [x13, #56] +; GISEL-NEXT: csel x11, xzr, x27, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: orr x11, x6, x11 +; GISEL-NEXT: ldp x28, x27, [sp, #320] ; 16-byte Folded Reload +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: ldr x11, [sp, #120] ; 8-byte Folded Reload +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x11, [x13, #72] +; GISEL-NEXT: ldp x12, x11, [sp, #96] ; 16-byte Folded Reload +; GISEL-NEXT: stp x11, x12, [x13, #80] +; GISEL-NEXT: csel x11, xzr, x25, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: orr x11, x3, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: ldr x9, [sp, #288] ; 8-byte Folded Reload +; GISEL-NEXT: ldr x11, [sp, #88] ; 8-byte Folded Reload +; GISEL-NEXT: csel x9, x9, x10, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: ldr x8, [sp, #80] ; 8-byte Folded Reload +; GISEL-NEXT: stp x11, x8, [x13, #96] +; GISEL-NEXT: csel x8, x20, x9, eq +; GISEL-NEXT: stp x26, x8, [x13, #112] +; GISEL-NEXT: ldp x20, x19, [sp, #384] ; 16-byte Folded Reload +; GISEL-NEXT: ldp x26, x25, [sp, #336] ; 16-byte Folded Reload +; GISEL-NEXT: add sp, sp, #416 +; GISEL-NEXT: ret +entry: + %input_val = load i1024, ptr %input, align 128 + %shift_ext = zext i32 %shift to i1024 + %shifted = shl i1024 %input_val, %shift_ext + store i1024 %shifted, ptr %result, align 128 + ret void +} + +define void @test_lshr_i1024(ptr %result, ptr 
%input, i32 %shift) { +; SDAG-LABEL: test_lshr_i1024: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: sub sp, sp, #336 +; SDAG-NEXT: stp x28, x27, [sp, #256] ; 16-byte Folded Spill +; SDAG-NEXT: stp x26, x25, [sp, #272] ; 16-byte Folded Spill +; SDAG-NEXT: stp x24, x23, [sp, #288] ; 16-byte Folded Spill +; SDAG-NEXT: stp x22, x21, [sp, #304] ; 16-byte Folded Spill +; SDAG-NEXT: stp x20, x19, [sp, #320] ; 16-byte Folded Spill +; SDAG-NEXT: .cfi_def_cfa_offset 336 +; SDAG-NEXT: .cfi_offset w19, -8 +; SDAG-NEXT: .cfi_offset w20, -16 +; SDAG-NEXT: .cfi_offset w21, -24 +; SDAG-NEXT: .cfi_offset w22, -32 +; SDAG-NEXT: .cfi_offset w23, -40 +; SDAG-NEXT: .cfi_offset w24, -48 +; SDAG-NEXT: .cfi_offset w25, -56 +; SDAG-NEXT: .cfi_offset w26, -64 +; SDAG-NEXT: .cfi_offset w27, -72 +; SDAG-NEXT: .cfi_offset w28, -80 +; SDAG-NEXT: ldp x8, x9, [x1, #112] +; SDAG-NEXT: movi.2d v0, #0000000000000000 +; SDAG-NEXT: ldp q1, q2, [x1] +; SDAG-NEXT: mov x10, sp +; SDAG-NEXT: ldp q3, q4, [x1, #32] +; SDAG-NEXT: ldr q7, [x1, #96] +; SDAG-NEXT: ldp q5, q6, [x1, #64] +; SDAG-NEXT: mvn w1, w2 +; SDAG-NEXT: stp x8, x9, [sp, #112] +; SDAG-NEXT: mov w8, w2 +; SDAG-NEXT: lsr x9, x8, #3 +; SDAG-NEXT: stp q2, q3, [sp, #16] +; SDAG-NEXT: and x14, x8, #0x3f +; SDAG-NEXT: stp q4, q5, [sp, #48] +; SDAG-NEXT: eor x15, x14, #0x3f +; SDAG-NEXT: and x9, x9, #0x78 +; SDAG-NEXT: stp q6, q7, [sp, #80] +; SDAG-NEXT: stp q0, q0, [sp, #128] +; SDAG-NEXT: add x10, x10, x9 +; SDAG-NEXT: stp q0, q0, [sp, #160] +; SDAG-NEXT: stp q0, q0, [sp, #192] +; SDAG-NEXT: stp q0, q0, [sp, #224] +; SDAG-NEXT: str q1, [sp] +; SDAG-NEXT: ldp x11, x9, [x10, #16] +; SDAG-NEXT: ldr x16, [x10, #32] +; SDAG-NEXT: ldp x12, x13, [x10, #40] +; SDAG-NEXT: ldr x3, [x10, #56] +; SDAG-NEXT: ldp x4, x6, [x10, #64] +; SDAG-NEXT: lsl x2, x16, #1 +; SDAG-NEXT: lsl x17, x11, #1 +; SDAG-NEXT: ldp x24, x26, [x10, #112] +; SDAG-NEXT: lsl x5, x13, #1 +; SDAG-NEXT: lsr x13, x13, x8 +; SDAG-NEXT: lsr x11, x11, x8 +; SDAG-NEXT: lsl x14, x17, x1 +; SDAG-NEXT: 
lsl x7, x6, #1 +; SDAG-NEXT: lsl x17, x2, x1 +; SDAG-NEXT: lsl x2, x5, x1 +; SDAG-NEXT: ldp x5, x22, [x10, #80] +; SDAG-NEXT: lsr x19, x4, x8 +; SDAG-NEXT: lsl x7, x7, x15 +; SDAG-NEXT: lsl x21, x4, #1 +; SDAG-NEXT: lsr x6, x6, x8 +; SDAG-NEXT: lsl x27, x24, #1 +; SDAG-NEXT: lsr x24, x24, x8 +; SDAG-NEXT: lsl x23, x5, #1 +; SDAG-NEXT: orr x4, x7, x19 +; SDAG-NEXT: lsr x25, x22, x8 +; SDAG-NEXT: ldp x20, x7, [x10, #96] +; SDAG-NEXT: lsl x21, x21, x1 +; SDAG-NEXT: lsl x23, x23, x1 +; SDAG-NEXT: lsl x22, x22, #1 +; SDAG-NEXT: lsr x5, x5, x8 +; SDAG-NEXT: lsr x16, x16, x8 +; SDAG-NEXT: lsl x19, x20, #1 +; SDAG-NEXT: orr x6, x6, x23 +; SDAG-NEXT: lsl x23, x7, #1 +; SDAG-NEXT: lsr x20, x20, x8 +; SDAG-NEXT: lsr x7, x7, x8 +; SDAG-NEXT: lsl x22, x22, x15 +; SDAG-NEXT: lsl x19, x19, x1 +; SDAG-NEXT: lsl x1, x27, x1 +; SDAG-NEXT: lsl x23, x23, x15 +; SDAG-NEXT: orr x5, x22, x5 +; SDAG-NEXT: ldp x28, x27, [sp, #256] ; 16-byte Folded Reload +; SDAG-NEXT: orr x19, x25, x19 +; SDAG-NEXT: lsl x25, x26, #1 +; SDAG-NEXT: orr x20, x23, x20 +; SDAG-NEXT: orr x1, x7, x1 +; SDAG-NEXT: ldp x23, x10, [x10] +; SDAG-NEXT: stp x20, x1, [x0, #96] +; SDAG-NEXT: lsl x20, x3, #1 +; SDAG-NEXT: lsl x25, x25, x15 +; SDAG-NEXT: lsr x26, x26, x8 +; SDAG-NEXT: stp x5, x19, [x0, #80] +; SDAG-NEXT: lsr x3, x3, x8 +; SDAG-NEXT: lsl x19, x20, x15 +; SDAG-NEXT: orr x7, x25, x24 +; SDAG-NEXT: lsl x1, x9, #1 +; SDAG-NEXT: stp x7, x26, [x0, #112] +; SDAG-NEXT: lsl x7, x10, #1 +; SDAG-NEXT: orr x3, x3, x21 +; SDAG-NEXT: orr x13, x19, x13 +; SDAG-NEXT: lsl x5, x12, #1 +; SDAG-NEXT: lsr x9, x9, x8 +; SDAG-NEXT: stp x13, x3, [x0, #48] +; SDAG-NEXT: lsl x13, x1, x15 +; SDAG-NEXT: lsr x23, x23, x8 +; SDAG-NEXT: lsr x12, x12, x8 +; SDAG-NEXT: lsr x8, x10, x8 +; SDAG-NEXT: lsl x10, x7, x15 +; SDAG-NEXT: stp x4, x6, [x0, #64] +; SDAG-NEXT: lsl x4, x5, x15 +; SDAG-NEXT: orr x9, x9, x17 +; SDAG-NEXT: orr x11, x13, x11 +; SDAG-NEXT: ldp x20, x19, [sp, #320] ; 16-byte Folded Reload +; SDAG-NEXT: stp x11, x9, [x0, #16] 
+; SDAG-NEXT: orr x9, x10, x23 +; SDAG-NEXT: orr x12, x12, x2 +; SDAG-NEXT: ldp x22, x21, [sp, #304] ; 16-byte Folded Reload +; SDAG-NEXT: orr x16, x4, x16 +; SDAG-NEXT: ldp x24, x23, [sp, #288] ; 16-byte Folded Reload +; SDAG-NEXT: orr x8, x8, x14 +; SDAG-NEXT: ldp x26, x25, [sp, #272] ; 16-byte Folded Reload +; SDAG-NEXT: stp x16, x12, [x0, #32] +; SDAG-NEXT: stp x9, x8, [x0] +; SDAG-NEXT: add sp, sp, #336 +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_lshr_i1024: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: sub sp, sp, #416 +; GISEL-NEXT: stp x28, x27, [sp, #320] ; 16-byte Folded Spill +; GISEL-NEXT: stp x26, x25, [sp, #336] ; 16-byte Folded Spill +; GISEL-NEXT: stp x24, x23, [sp, #352] ; 16-byte Folded Spill +; GISEL-NEXT: stp x22, x21, [sp, #368] ; 16-byte Folded Spill +; GISEL-NEXT: stp x20, x19, [sp, #384] ; 16-byte Folded Spill +; GISEL-NEXT: stp x29, x30, [sp, #400] ; 16-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 416 +; GISEL-NEXT: .cfi_offset w30, -8 +; GISEL-NEXT: .cfi_offset w29, -16 +; GISEL-NEXT: .cfi_offset w19, -24 +; GISEL-NEXT: .cfi_offset w20, -32 +; GISEL-NEXT: .cfi_offset w21, -40 +; GISEL-NEXT: .cfi_offset w22, -48 +; GISEL-NEXT: .cfi_offset w23, -56 +; GISEL-NEXT: .cfi_offset w24, -64 +; GISEL-NEXT: .cfi_offset w25, -72 +; GISEL-NEXT: .cfi_offset w26, -80 +; GISEL-NEXT: .cfi_offset w27, -88 +; GISEL-NEXT: .cfi_offset w28, -96 +; GISEL-NEXT: mov w8, w2 +; GISEL-NEXT: ldp x20, x16, [x1] +; GISEL-NEXT: mov w9, #64 ; =0x40 +; GISEL-NEXT: and x14, x8, #0x3f +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: sub x15, x9, x14 +; GISEL-NEXT: ldp x12, x13, [x1, #16] +; GISEL-NEXT: lsl x10, x16, x15 +; GISEL-NEXT: lsr x9, x8, #6 +; GISEL-NEXT: lsr x11, x20, x14 +; GISEL-NEXT: lsr x19, x16, x14 +; GISEL-NEXT: str x16, [sp, #264] ; 8-byte Folded Spill +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: lsl x22, x12, x15 +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x10, x11, x10 +; GISEL-NEXT: str x12, [sp, #240] ; 8-byte Folded Spill +; GISEL-NEXT: lsr x26, 
x12, x14 +; GISEL-NEXT: csel x10, x10, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsl x24, x13, x15 +; GISEL-NEXT: csel x11, xzr, x22, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: lsr x5, x13, x14 +; GISEL-NEXT: orr x11, x19, x11 +; GISEL-NEXT: ldp x12, x16, [x1, #32] +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x0, [sp, #296] ; 8-byte Folded Spill +; GISEL-NEXT: csel x11, xzr, x24, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: str x13, [sp, #216] ; 8-byte Folded Spill +; GISEL-NEXT: lsl x23, x12, x15 +; GISEL-NEXT: orr x11, x26, x11 +; GISEL-NEXT: stp x12, x16, [sp, #176] ; 16-byte Folded Spill +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsr x17, x12, x14 +; GISEL-NEXT: csel x11, xzr, x23, eq +; GISEL-NEXT: lsl x0, x16, x15 +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x11, x5, x11 +; GISEL-NEXT: ldp x13, x12, [x1, #48] +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsr x7, x16, x14 +; GISEL-NEXT: csel x11, xzr, x0, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: stp x17, x0, [sp, #152] ; 16-byte Folded Spill +; GISEL-NEXT: lsl x2, x13, x15 +; GISEL-NEXT: orr x11, x17, x11 +; GISEL-NEXT: stp x13, x12, [sp, #192] ; 16-byte Folded Spill +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsr x13, x13, x14 +; GISEL-NEXT: csel x11, xzr, x2, eq +; GISEL-NEXT: lsl x0, x12, x15 +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: stp x13, x2, [sp, #136] ; 16-byte Folded Spill +; GISEL-NEXT: orr x11, x7, x11 +; GISEL-NEXT: lsr x12, x12, x14 +; GISEL-NEXT: ldp x4, x2, [x1, #64] +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: ldr x17, [sp, #144] ; 8-byte Folded Reload +; GISEL-NEXT: stp x5, x23, [sp, #24] ; 16-byte Folded Spill +; GISEL-NEXT: csel x11, xzr, x0, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: lsl x3, x4, x15 +; GISEL-NEXT: orr x11, x13, x11 +; GISEL-NEXT: str x4, [sp, #208] ; 8-byte Folded Spill +; 
GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsl x6, x2, x15 +; GISEL-NEXT: csel x11, xzr, x3, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: lsr x13, x4, x14 +; GISEL-NEXT: orr x11, x12, x11 +; GISEL-NEXT: str x2, [sp, #224] ; 8-byte Folded Spill +; GISEL-NEXT: csel x16, x11, x10, eq +; GISEL-NEXT: ldp x10, x4, [x1, #80] +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: mov x11, x2 +; GISEL-NEXT: stp x13, x12, [sp, #120] ; 16-byte Folded Spill +; GISEL-NEXT: csel x2, xzr, x6, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: lsr x11, x11, x14 +; GISEL-NEXT: orr x2, x13, x2 +; GISEL-NEXT: lsl x12, x10, x15 +; GISEL-NEXT: str x10, [sp, #232] ; 8-byte Folded Spill +; GISEL-NEXT: csel x16, x2, x16, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsr x10, x10, x14 +; GISEL-NEXT: csel x2, xzr, x12, eq +; GISEL-NEXT: str x12, [sp, #312] ; 8-byte Folded Spill +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: orr x2, x11, x2 +; GISEL-NEXT: lsl x12, x4, x15 +; GISEL-NEXT: str x10, [sp, #304] ; 8-byte Folded Spill +; GISEL-NEXT: csel x16, x2, x16, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsr x13, x4, x14 +; GISEL-NEXT: stp x12, x11, [sp, #104] ; 16-byte Folded Spill +; GISEL-NEXT: ldr x11, [x1, #96] +; GISEL-NEXT: csel x2, xzr, x12, eq +; GISEL-NEXT: orr x2, x10, x2 +; GISEL-NEXT: ldp x10, x30, [x1, #104] +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: lsl x28, x11, x15 +; GISEL-NEXT: stp x4, x11, [sp, #248] ; 16-byte Folded Spill +; GISEL-NEXT: csel x16, x2, x16, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x3, [sp, #16] ; 8-byte Folded Spill +; GISEL-NEXT: csel x2, xzr, x28, eq +; GISEL-NEXT: lsl x12, x10, x15 +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: orr x2, x13, x2 +; GISEL-NEXT: lsl x21, x30, x15 +; GISEL-NEXT: stp x10, x30, [sp, #272] ; 16-byte Folded Spill +; GISEL-NEXT: csel x16, x2, x16, eq +; GISEL-NEXT: stp x12, x13, [sp, #88] ; 16-byte Folded Spill +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsr x13, x11, x14 +; GISEL-NEXT: csel x2, xzr, x12, eq +; 
GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: ldr x11, [x1, #120] +; GISEL-NEXT: lsr x10, x10, x14 +; GISEL-NEXT: lsr x27, x30, x14 +; GISEL-NEXT: orr x4, x13, x2 +; GISEL-NEXT: mov x12, x23 +; GISEL-NEXT: str x28, [sp, #48] ; 8-byte Folded Spill +; GISEL-NEXT: csel x16, x4, x16, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsl x25, x11, x15 +; GISEL-NEXT: csel x1, xzr, x21, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: stp x10, x13, [sp, #72] ; 16-byte Folded Spill +; GISEL-NEXT: orr x1, x10, x1 +; GISEL-NEXT: lsr x10, x11, x14 +; GISEL-NEXT: str x11, [sp, #288] ; 8-byte Folded Spill +; GISEL-NEXT: csel x1, x1, x16, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x21, [sp, #40] ; 8-byte Folded Spill +; GISEL-NEXT: csel x30, xzr, x25, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: stp x27, x10, [sp, #56] ; 16-byte Folded Spill +; GISEL-NEXT: orr x30, x27, x30 +; GISEL-NEXT: ldp x11, x13, [sp, #152] ; 16-byte Folded Reload +; GISEL-NEXT: csel x1, x30, x1, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: mov x30, x7 +; GISEL-NEXT: csel x1, x10, x1, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x10, x20, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x1, xzr, x22, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: str x10, [sp, #168] ; 8-byte Folded Spill +; GISEL-NEXT: orr x1, x19, x1 +; GISEL-NEXT: ldp x20, x14, [sp, #112] ; 16-byte Folded Reload +; GISEL-NEXT: csel x1, x1, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x24, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x19, x26, x19 +; GISEL-NEXT: ldp x10, x15, [sp, #304] ; 16-byte Folded Reload +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x23, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x19, x5, x19 +; GISEL-NEXT: ldp x16, x22, [sp, #96] ; 16-byte Folded Reload +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x13, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x19, x11, x19 +; GISEL-NEXT: ldp x4, 
x2, [sp, #80] ; 16-byte Folded Reload +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x17, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x19, x7, x19 +; GISEL-NEXT: mov x7, x0 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x0, eq +; GISEL-NEXT: ldp x23, x0, [sp, #128] ; 16-byte Folded Reload +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x19, x0, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x3, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: orr x19, x23, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x6, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: orr x19, x14, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x15, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: orr x19, x20, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x22, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: orr x19, x10, x19 +; GISEL-NEXT: ldr x10, [sp, #264] ; 8-byte Folded Reload +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x28, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: orr x19, x16, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x2, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: orr x19, x4, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x21, eq +; GISEL-NEXT: ldp x28, x21, [sp, #64] ; 16-byte Folded Reload +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: orr x19, x21, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x25, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: orr x19, x27, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x1, x28, x1, eq +; GISEL-NEXT: cmp x9, #15 +; 
GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x10, x10, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x1, xzr, x24, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: mov x24, x11 +; GISEL-NEXT: orr x1, x26, x1 +; GISEL-NEXT: str x10, [sp, #264] ; 8-byte Folded Spill +; GISEL-NEXT: ldr x10, [sp, #240] ; 8-byte Folded Reload +; GISEL-NEXT: csel x1, x1, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: mov x26, x13 +; GISEL-NEXT: csel x19, xzr, x12, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: mov x12, x30 +; GISEL-NEXT: orr x19, x5, x19 +; GISEL-NEXT: mov x5, x15 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x13, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: mov x13, x23 +; GISEL-NEXT: orr x19, x11, x19 +; GISEL-NEXT: mov x11, x17 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x17, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: mov x17, x20 +; GISEL-NEXT: orr x19, x30, x19 +; GISEL-NEXT: mov x30, x7 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: mov x27, x30 +; GISEL-NEXT: csel x19, xzr, x7, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: mov x7, x14 +; GISEL-NEXT: orr x19, x0, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x3, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: mov x3, x22 +; GISEL-NEXT: orr x19, x23, x19 +; GISEL-NEXT: ldr x23, [sp, #16] ; 8-byte Folded Reload +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x6, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: orr x19, x14, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x15, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: ldr x15, [sp, #304] ; 8-byte Folded Reload +; GISEL-NEXT: orr x19, x20, x19 +; GISEL-NEXT: ldp x14, x20, [sp, #40] ; 16-byte Folded Reload +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst 
x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x22, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: ldr x22, [sp, #56] ; 8-byte Folded Reload +; GISEL-NEXT: orr x19, x15, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x20, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: orr x19, x16, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x2, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: orr x19, x4, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x14, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: orr x19, x21, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x25, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: orr x19, x22, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x1, x28, x1, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x10, x10, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x10, [sp, #240] ; 8-byte Folded Spill +; GISEL-NEXT: ldr x10, [sp, #32] ; 8-byte Folded Reload +; GISEL-NEXT: csel x1, xzr, x10, eq +; GISEL-NEXT: ldr x10, [sp, #24] ; 8-byte Folded Reload +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x1, x10, x1 +; GISEL-NEXT: ldr x10, [sp, #216] ; 8-byte Folded Reload +; GISEL-NEXT: csel x1, x1, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x26, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x19, x24, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x11, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x19, x12, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x30, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: mov x30, x0 +; GISEL-NEXT: orr x19, x0, x19 +; GISEL-NEXT: mov x0, x13 +; 
GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x23, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x19, x13, x19 +; GISEL-NEXT: mov x13, x3 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x6, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x19, x7, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x5, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: mov x5, x17 +; GISEL-NEXT: orr x19, x17, x19 +; GISEL-NEXT: mov x17, x22 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x3, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: mov x3, x20 +; GISEL-NEXT: orr x19, x15, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x20, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: mov x20, x14 +; GISEL-NEXT: orr x19, x16, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x2, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: orr x19, x4, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x14, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: mov x14, x13 +; GISEL-NEXT: orr x19, x21, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x25, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: orr x19, x22, x19 +; GISEL-NEXT: mov x22, x30 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x1, x28, x1, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x10, x10, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x1, xzr, x26, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: str x10, [sp, #216] ; 8-byte Folded Spill +; GISEL-NEXT: 
orr x1, x24, x1 +; GISEL-NEXT: ldr x10, [sp, #176] ; 8-byte Folded Reload +; GISEL-NEXT: mov x24, x3 +; GISEL-NEXT: csel x1, x1, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: mov x26, x5 +; GISEL-NEXT: csel x19, xzr, x11, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x19, x12, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x27, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x19, x30, x19 +; GISEL-NEXT: ldr x30, [sp, #312] ; 8-byte Folded Reload +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x23, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x19, x0, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x6, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x19, x7, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x30, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x19, x5, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x13, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: orr x19, x15, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x3, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: orr x19, x16, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x2, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: orr x19, x4, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x20, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: orr x19, x21, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x19, xzr, x25, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: orr x19, x17, x19 +; GISEL-NEXT: csel x1, x19, x1, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x1, x28, x1, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #13 +; 
GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x19, x10, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: ldr x10, [sp, #184] ; 8-byte Folded Reload +; GISEL-NEXT: csel x1, xzr, x11, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: mov x11, x23 +; GISEL-NEXT: orr x1, x12, x1 +; GISEL-NEXT: mov x12, x0 +; GISEL-NEXT: csel x1, x1, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x27, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x3, x22, x3 +; GISEL-NEXT: csel x1, x3, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x23, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: mov x23, x17 +; GISEL-NEXT: orr x3, x0, x3 +; GISEL-NEXT: csel x1, x3, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x6, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x3, x7, x3 +; GISEL-NEXT: csel x1, x3, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x30, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x3, x26, x3 +; GISEL-NEXT: csel x1, x3, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x13, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x3, x15, x3 +; GISEL-NEXT: csel x1, x3, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x24, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: orr x3, x16, x3 +; GISEL-NEXT: csel x1, x3, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x2, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: orr x3, x4, x3 +; GISEL-NEXT: csel x1, x3, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x20, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: orr x3, x21, x3 +; GISEL-NEXT: csel x1, x3, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x25, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: orr x3, x17, x3 +; GISEL-NEXT: csel x1, x3, x1, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: mov x3, x4 +; 
GISEL-NEXT: csel x1, x28, x1, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x10, x10, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x17, xzr, x27, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: str x10, [sp, #184] ; 8-byte Folded Spill +; GISEL-NEXT: orr x17, x22, x17 +; GISEL-NEXT: ldr x10, [sp, #192] ; 8-byte Folded Reload +; GISEL-NEXT: csel x17, x17, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x11, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x0, x12, x0 +; GISEL-NEXT: csel x17, x0, x17, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x6, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x0, x7, x0 +; GISEL-NEXT: csel x17, x0, x17, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x30, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x0, x26, x0 +; GISEL-NEXT: csel x17, x0, x17, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x13, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x0, x15, x0 +; GISEL-NEXT: csel x17, x0, x17, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x24, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x0, x16, x0 +; GISEL-NEXT: csel x17, x0, x17, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x2, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: orr x0, x4, x0 +; GISEL-NEXT: csel x17, x0, x17, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x20, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: orr x0, x21, x0 +; GISEL-NEXT: csel x17, x0, x17, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x25, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: orr x0, x23, x0 +; GISEL-NEXT: csel x17, x0, x17, eq +; GISEL-NEXT: cmp x9, #9 +; 
GISEL-NEXT: csel x17, x28, x17, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: csel x17, xzr, x17, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x17, xzr, x17, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x17, xzr, x17, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x17, xzr, x17, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x17, xzr, x17, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x17, xzr, x17, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x17, x10, x17, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: ldr x10, [sp, #200] ; 8-byte Folded Reload +; GISEL-NEXT: csel x13, xzr, x11, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x12, x12, x13 +; GISEL-NEXT: csel x12, x12, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x6, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x13, x7, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x30, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x13, x26, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x14, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x13, x15, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x24, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x13, x16, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x2, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x13, x4, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x20, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: orr x13, x21, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x25, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: orr x13, x23, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: csel x12, x28, x12, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: 
cmp x9, #10 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x12, xzr, x12, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x12, x10, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x6, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x10, x7, x11 +; GISEL-NEXT: csel x10, x10, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x30, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x11, x26, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x14, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x11, x15, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x24, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x11, x16, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x2, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x11, x4, x11 +; GISEL-NEXT: ldr x4, [sp, #168] ; 8-byte Folded Reload +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x20, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x11, x21, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x25, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: orr x11, x23, x11 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: ldr x11, [sp, #208] ; 8-byte Folded Reload +; GISEL-NEXT: csel x10, x28, x10, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: 
csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x11, xzr, x30, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x11, x26, x11 +; GISEL-NEXT: ldp x29, x30, [sp, #400] ; 16-byte Folded Reload +; GISEL-NEXT: csel x11, x11, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x14, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x13, x15, x13 +; GISEL-NEXT: csel x11, x13, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x24, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x13, x16, x13 +; GISEL-NEXT: csel x11, x13, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x2, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x13, x3, x13 +; GISEL-NEXT: csel x11, x13, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x20, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x13, x21, x13 +; GISEL-NEXT: csel x11, x13, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x25, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: orr x13, x23, x13 +; GISEL-NEXT: csel x11, x13, x11, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: ldr x13, [sp, #224] ; 8-byte Folded Reload +; GISEL-NEXT: csel x11, x28, x11, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #14 +; 
GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x11, x13, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x14, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x13, x15, x13 +; GISEL-NEXT: csel x13, x13, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x24, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x0, x16, x0 +; GISEL-NEXT: csel x13, x0, x13, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x2, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x0, x3, x0 +; GISEL-NEXT: csel x13, x0, x13, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x20, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x0, x21, x0 +; GISEL-NEXT: csel x13, x0, x13, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x25, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: orr x0, x23, x0 +; GISEL-NEXT: csel x13, x0, x13, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: ldr x0, [sp, #232] ; 8-byte Folded Reload +; GISEL-NEXT: csel x13, x28, x13, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x13, xzr, x13, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x13, x0, x13, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x24, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x0, x16, x0 +; GISEL-NEXT: ldr x16, [sp, #280] ; 8-byte Folded Reload +; GISEL-NEXT: csel 
x0, x0, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x1, xzr, x2, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x1, x3, x1 +; GISEL-NEXT: csel x0, x1, x0, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x1, xzr, x20, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x1, x21, x1 +; GISEL-NEXT: csel x0, x1, x0, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x1, xzr, x25, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: orr x1, x23, x1 +; GISEL-NEXT: csel x0, x1, x0, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: ldr x1, [sp, #248] ; 8-byte Folded Reload +; GISEL-NEXT: csel x0, x28, x0, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x0, xzr, x0, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x0, xzr, x0, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x0, xzr, x0, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: csel x0, xzr, x0, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: csel x0, xzr, x0, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: csel x0, xzr, x0, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x0, xzr, x0, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x0, xzr, x0, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x0, xzr, x0, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x0, xzr, x0, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x0, xzr, x0, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x0, x1, x0, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x1, xzr, x2, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x1, x3, x1 +; GISEL-NEXT: csel x1, x1, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x20, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x3, x21, x3 +; GISEL-NEXT: csel x1, x3, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x25, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: orr x3, x23, x3 +; GISEL-NEXT: csel x1, x3, x1, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: ldr x3, [sp, #256] ; 8-byte Folded Reload +; GISEL-NEXT: csel x1, x28, x1, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: csel x1, 
xzr, x1, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x3, x3, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x1, xzr, x20, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x1, x21, x1 +; GISEL-NEXT: ldp x22, x21, [sp, #368] ; 16-byte Folded Reload +; GISEL-NEXT: csel x1, x1, xzr, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x2, xzr, x25, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: orr x2, x23, x2 +; GISEL-NEXT: csel x1, x2, x1, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: ldr x2, [sp, #272] ; 8-byte Folded Reload +; GISEL-NEXT: csel x1, x28, x1, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x1, 
xzr, x1, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x1, xzr, x1, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x2, x2, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: ldr x1, [sp, #264] ; 8-byte Folded Reload +; GISEL-NEXT: csel x15, xzr, x25, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: orr x15, x23, x15 +; GISEL-NEXT: ldp x24, x23, [sp, #352] ; 16-byte Folded Reload +; GISEL-NEXT: csel x15, x15, xzr, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: csel x15, x28, x15, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: csel x15, xzr, x15, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: csel x15, xzr, x15, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: csel x15, xzr, x15, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: csel x15, xzr, x15, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: csel x15, xzr, x15, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: csel x15, xzr, x15, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: csel x15, xzr, x15, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: csel x15, xzr, x15, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: csel x15, xzr, x15, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: csel x15, xzr, x15, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x15, xzr, x15, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x15, xzr, x15, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x15, xzr, x15, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x15, xzr, x15, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x15, x16, x15, eq +; GISEL-NEXT: cmp x9, #0 +; GISEL-NEXT: ldr x16, [sp, #296] ; 8-byte Folded Reload +; GISEL-NEXT: csel x14, x28, xzr, eq +; GISEL-NEXT: cmp x9, #1 +; GISEL-NEXT: csel x14, xzr, x14, eq +; GISEL-NEXT: cmp x9, #2 +; GISEL-NEXT: stp x17, x12, [x16, #48] +; GISEL-NEXT: csel x14, xzr, x14, eq +; GISEL-NEXT: cmp x9, #3 +; GISEL-NEXT: stp x10, x11, [x16, #64] +; GISEL-NEXT: csel x14, xzr, x14, eq +; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: stp x4, x1, [x16] +; GISEL-NEXT: csel x14, xzr, x14, eq +; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: ldr x4, [sp, 
#240] ; 8-byte Folded Reload +; GISEL-NEXT: csel x14, xzr, x14, eq +; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: ldr x1, [sp, #216] ; 8-byte Folded Reload +; GISEL-NEXT: csel x14, xzr, x14, eq +; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: stp x13, x0, [x16, #80] +; GISEL-NEXT: csel x14, xzr, x14, eq +; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: stp x4, x1, [x16, #16] +; GISEL-NEXT: csel x14, xzr, x14, eq +; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: ldr x1, [sp, #184] ; 8-byte Folded Reload +; GISEL-NEXT: csel x12, xzr, x14, eq +; GISEL-NEXT: cmp x9, #10 +; GISEL-NEXT: stp x3, x2, [x16, #96] +; GISEL-NEXT: csel x10, xzr, x12, eq +; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: stp x19, x1, [x16, #32] +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #12 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: csel x10, xzr, x10, eq +; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: csel x9, xzr, x10, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: ldr x8, [sp, #288] ; 8-byte Folded Reload +; GISEL-NEXT: ldp x20, x19, [sp, #384] ; 16-byte Folded Reload +; GISEL-NEXT: ldp x26, x25, [sp, #336] ; 16-byte Folded Reload +; GISEL-NEXT: csel x8, x8, x9, eq +; GISEL-NEXT: ldp x28, x27, [sp, #320] ; 16-byte Folded Reload +; GISEL-NEXT: stp x15, x8, [x16, #112] +; GISEL-NEXT: add sp, sp, #416 +; GISEL-NEXT: ret +entry: + %input_val = load i1024, ptr %input, align 128 + %shift_ext = zext i32 %shift to i1024 + %shifted = lshr i1024 %input_val, %shift_ext + store i1024 %shifted, ptr %result, align 128 + ret void +} + +define void @test_ashr_i1024(ptr %result, ptr %input, i32 %shift) { +; SDAG-LABEL: test_ashr_i1024: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: sub sp, sp, #336 +; SDAG-NEXT: stp x28, x27, [sp, #256] ; 16-byte Folded Spill +; SDAG-NEXT: stp x26, x25, [sp, #272] ; 16-byte Folded Spill +; SDAG-NEXT: stp x24, x23, [sp, #288] ; 16-byte Folded Spill +; SDAG-NEXT: stp x22, x21, [sp, #304] ; 16-byte Folded Spill +; 
SDAG-NEXT: stp x20, x19, [sp, #320] ; 16-byte Folded Spill +; SDAG-NEXT: .cfi_def_cfa_offset 336 +; SDAG-NEXT: .cfi_offset w19, -8 +; SDAG-NEXT: .cfi_offset w20, -16 +; SDAG-NEXT: .cfi_offset w21, -24 +; SDAG-NEXT: .cfi_offset w22, -32 +; SDAG-NEXT: .cfi_offset w23, -40 +; SDAG-NEXT: .cfi_offset w24, -48 +; SDAG-NEXT: .cfi_offset w25, -56 +; SDAG-NEXT: .cfi_offset w26, -64 +; SDAG-NEXT: .cfi_offset w27, -72 +; SDAG-NEXT: .cfi_offset w28, -80 +; SDAG-NEXT: ldp x8, x9, [x1, #112] +; SDAG-NEXT: mov x11, sp +; SDAG-NEXT: ldp q0, q1, [x1] +; SDAG-NEXT: ldr q6, [x1, #96] +; SDAG-NEXT: ldp q2, q3, [x1, #32] +; SDAG-NEXT: ldp q4, q5, [x1, #64] +; SDAG-NEXT: mvn w1, w2 +; SDAG-NEXT: stp x8, x9, [sp, #112] +; SDAG-NEXT: mov w8, w2 +; SDAG-NEXT: asr x9, x9, #63 +; SDAG-NEXT: lsr x10, x8, #3 +; SDAG-NEXT: stp q1, q2, [sp, #16] +; SDAG-NEXT: and x14, x8, #0x3f +; SDAG-NEXT: stp q3, q4, [sp, #48] +; SDAG-NEXT: eor x15, x14, #0x3f +; SDAG-NEXT: and x10, x10, #0x78 +; SDAG-NEXT: stp q5, q6, [sp, #80] +; SDAG-NEXT: str q0, [sp] +; SDAG-NEXT: add x10, x11, x10 +; SDAG-NEXT: stp x9, x9, [sp, #240] +; SDAG-NEXT: stp x9, x9, [sp, #224] +; SDAG-NEXT: stp x9, x9, [sp, #208] +; SDAG-NEXT: stp x9, x9, [sp, #192] +; SDAG-NEXT: stp x9, x9, [sp, #176] +; SDAG-NEXT: stp x9, x9, [sp, #160] +; SDAG-NEXT: stp x9, x9, [sp, #144] +; SDAG-NEXT: stp x9, x9, [sp, #128] +; SDAG-NEXT: ldp x11, x9, [x10, #16] +; SDAG-NEXT: ldr x16, [x10, #32] +; SDAG-NEXT: ldp x12, x13, [x10, #40] +; SDAG-NEXT: ldr x3, [x10, #56] +; SDAG-NEXT: ldp x4, x6, [x10, #64] +; SDAG-NEXT: lsl x2, x16, #1 +; SDAG-NEXT: lsl x17, x11, #1 +; SDAG-NEXT: ldp x24, x26, [x10, #112] +; SDAG-NEXT: lsl x5, x13, #1 +; SDAG-NEXT: lsr x13, x13, x8 +; SDAG-NEXT: lsr x11, x11, x8 +; SDAG-NEXT: lsl x14, x17, x1 +; SDAG-NEXT: lsl x7, x6, #1 +; SDAG-NEXT: lsl x17, x2, x1 +; SDAG-NEXT: lsl x2, x5, x1 +; SDAG-NEXT: ldp x5, x22, [x10, #80] +; SDAG-NEXT: lsr x19, x4, x8 +; SDAG-NEXT: lsl x7, x7, x15 +; SDAG-NEXT: lsl x21, x4, #1 +; SDAG-NEXT: lsr x6, 
x6, x8 +; SDAG-NEXT: lsl x27, x24, #1 +; SDAG-NEXT: lsr x24, x24, x8 +; SDAG-NEXT: lsl x23, x5, #1 +; SDAG-NEXT: orr x4, x7, x19 +; SDAG-NEXT: lsr x25, x22, x8 +; SDAG-NEXT: ldp x20, x7, [x10, #96] +; SDAG-NEXT: lsl x21, x21, x1 +; SDAG-NEXT: lsl x23, x23, x1 +; SDAG-NEXT: lsl x22, x22, #1 +; SDAG-NEXT: lsr x5, x5, x8 +; SDAG-NEXT: lsr x16, x16, x8 +; SDAG-NEXT: lsl x19, x20, #1 +; SDAG-NEXT: orr x6, x6, x23 +; SDAG-NEXT: lsl x23, x7, #1 +; SDAG-NEXT: lsr x20, x20, x8 +; SDAG-NEXT: lsr x7, x7, x8 +; SDAG-NEXT: lsl x22, x22, x15 +; SDAG-NEXT: lsl x19, x19, x1 +; SDAG-NEXT: lsl x1, x27, x1 +; SDAG-NEXT: lsl x23, x23, x15 +; SDAG-NEXT: orr x5, x22, x5 +; SDAG-NEXT: ldp x28, x27, [sp, #256] ; 16-byte Folded Reload +; SDAG-NEXT: orr x19, x25, x19 +; SDAG-NEXT: lsl x25, x26, #1 +; SDAG-NEXT: orr x20, x23, x20 +; SDAG-NEXT: orr x1, x7, x1 +; SDAG-NEXT: ldp x23, x10, [x10] +; SDAG-NEXT: stp x20, x1, [x0, #96] +; SDAG-NEXT: lsl x20, x3, #1 +; SDAG-NEXT: lsl x25, x25, x15 +; SDAG-NEXT: asr x26, x26, x8 +; SDAG-NEXT: stp x5, x19, [x0, #80] +; SDAG-NEXT: lsr x3, x3, x8 +; SDAG-NEXT: lsl x19, x20, x15 +; SDAG-NEXT: orr x7, x25, x24 +; SDAG-NEXT: lsl x1, x9, #1 +; SDAG-NEXT: stp x7, x26, [x0, #112] +; SDAG-NEXT: lsl x7, x10, #1 +; SDAG-NEXT: orr x3, x3, x21 +; SDAG-NEXT: orr x13, x19, x13 +; SDAG-NEXT: lsl x5, x12, #1 +; SDAG-NEXT: lsr x9, x9, x8 +; SDAG-NEXT: stp x13, x3, [x0, #48] +; SDAG-NEXT: lsl x13, x1, x15 +; SDAG-NEXT: lsr x23, x23, x8 +; SDAG-NEXT: lsr x12, x12, x8 +; SDAG-NEXT: lsr x8, x10, x8 +; SDAG-NEXT: lsl x10, x7, x15 +; SDAG-NEXT: stp x4, x6, [x0, #64] +; SDAG-NEXT: lsl x4, x5, x15 +; SDAG-NEXT: orr x9, x9, x17 +; SDAG-NEXT: orr x11, x13, x11 +; SDAG-NEXT: ldp x20, x19, [sp, #320] ; 16-byte Folded Reload +; SDAG-NEXT: stp x11, x9, [x0, #16] +; SDAG-NEXT: orr x9, x10, x23 +; SDAG-NEXT: orr x12, x12, x2 +; SDAG-NEXT: ldp x22, x21, [sp, #304] ; 16-byte Folded Reload +; SDAG-NEXT: orr x16, x4, x16 +; SDAG-NEXT: ldp x24, x23, [sp, #288] ; 16-byte Folded Reload +; 
SDAG-NEXT: orr x8, x8, x14 +; SDAG-NEXT: ldp x26, x25, [sp, #272] ; 16-byte Folded Reload +; SDAG-NEXT: stp x16, x12, [x0, #32] +; SDAG-NEXT: stp x9, x8, [x0] +; SDAG-NEXT: add sp, sp, #336 +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_ashr_i1024: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: sub sp, sp, #432 +; GISEL-NEXT: stp x28, x27, [sp, #336] ; 16-byte Folded Spill +; GISEL-NEXT: stp x26, x25, [sp, #352] ; 16-byte Folded Spill +; GISEL-NEXT: stp x24, x23, [sp, #368] ; 16-byte Folded Spill +; GISEL-NEXT: stp x22, x21, [sp, #384] ; 16-byte Folded Spill +; GISEL-NEXT: stp x20, x19, [sp, #400] ; 16-byte Folded Spill +; GISEL-NEXT: stp x29, x30, [sp, #416] ; 16-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 432 +; GISEL-NEXT: .cfi_offset w30, -8 +; GISEL-NEXT: .cfi_offset w29, -16 +; GISEL-NEXT: .cfi_offset w19, -24 +; GISEL-NEXT: .cfi_offset w20, -32 +; GISEL-NEXT: .cfi_offset w21, -40 +; GISEL-NEXT: .cfi_offset w22, -48 +; GISEL-NEXT: .cfi_offset w23, -56 +; GISEL-NEXT: .cfi_offset w24, -64 +; GISEL-NEXT: .cfi_offset w25, -72 +; GISEL-NEXT: .cfi_offset w26, -80 +; GISEL-NEXT: .cfi_offset w27, -88 +; GISEL-NEXT: .cfi_offset w28, -96 +; GISEL-NEXT: str x0, [sp, #264] ; 8-byte Folded Spill +; GISEL-NEXT: mov w8, w2 +; GISEL-NEXT: mov w9, #64 ; =0x40 +; GISEL-NEXT: ldp x7, x0, [x1] +; GISEL-NEXT: and x15, x8, #0x3f +; GISEL-NEXT: sub x14, x9, x15 +; GISEL-NEXT: ldr x28, [x1, #120] +; GISEL-NEXT: lsr x10, x8, #6 +; GISEL-NEXT: ldp x17, x16, [x1, #16] +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsl x9, x0, x14 +; GISEL-NEXT: lsr x12, x7, x15 +; GISEL-NEXT: asr x11, x28, #63 +; GISEL-NEXT: lsr x20, x0, x15 +; GISEL-NEXT: str x0, [sp, #232] ; 8-byte Folded Spill +; GISEL-NEXT: lsl x27, x28, x14 +; GISEL-NEXT: csel x9, xzr, x9, eq +; GISEL-NEXT: lsl x19, x17, x14 +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: orr x9, x12, x9 +; GISEL-NEXT: str x17, [sp, #208] ; 8-byte Folded Spill +; GISEL-NEXT: lsr x2, x17, x15 +; GISEL-NEXT: csel x9, x9, x11, eq +; GISEL-NEXT: tst x8, #0x3f 
+; GISEL-NEXT: lsl x30, x16, x14 +; GISEL-NEXT: csel x12, xzr, x19, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: lsr x25, x16, x15 +; GISEL-NEXT: orr x12, x20, x12 +; GISEL-NEXT: ldp x13, x17, [x1, #32] +; GISEL-NEXT: csel x9, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x16, [sp, #184] ; 8-byte Folded Spill +; GISEL-NEXT: csel x12, xzr, x30, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: str x2, [sp, #88] ; 8-byte Folded Spill +; GISEL-NEXT: lsl x24, x13, x14 +; GISEL-NEXT: orr x12, x2, x12 +; GISEL-NEXT: str x13, [sp, #168] ; 8-byte Folded Spill +; GISEL-NEXT: csel x9, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsr x21, x13, x15 +; GISEL-NEXT: csel x12, xzr, x24, eq +; GISEL-NEXT: lsl x0, x17, x14 +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: orr x12, x25, x12 +; GISEL-NEXT: ldp x16, x13, [x1, #48] +; GISEL-NEXT: csel x9, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsr x3, x17, x15 +; GISEL-NEXT: csel x12, xzr, x0, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: str x0, [sp, #128] ; 8-byte Folded Spill +; GISEL-NEXT: lsl x2, x16, x14 +; GISEL-NEXT: orr x12, x21, x12 +; GISEL-NEXT: mov x0, x16 +; GISEL-NEXT: csel x9, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x17, [sp, #144] ; 8-byte Folded Spill +; GISEL-NEXT: csel x12, xzr, x2, eq +; GISEL-NEXT: str x2, [sp, #304] ; 8-byte Folded Spill +; GISEL-NEXT: lsl x2, x13, x14 +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: orr x12, x3, x12 +; GISEL-NEXT: ldr x17, [x1, #64] +; GISEL-NEXT: lsr x6, x0, x15 +; GISEL-NEXT: csel x9, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x2, eq +; GISEL-NEXT: stp x16, x13, [sp, #152] ; 16-byte Folded Spill +; GISEL-NEXT: mov x16, x13 +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: orr x12, x6, x12 +; GISEL-NEXT: lsl x0, x17, x14 +; GISEL-NEXT: csel x9, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsr x16, x16, x15 +; GISEL-NEXT: ldr x13, [x1, #72] +; GISEL-NEXT: csel x12, xzr, x0, eq +; GISEL-NEXT: cmp x10, #7 +; 
GISEL-NEXT: orr x12, x16, x12 +; GISEL-NEXT: stp x16, x0, [sp, #288] ; 16-byte Folded Spill +; GISEL-NEXT: lsr x0, x17, x15 +; GISEL-NEXT: csel x9, x12, x9, eq +; GISEL-NEXT: lsl x12, x13, x14 +; GISEL-NEXT: mov x16, x13 +; GISEL-NEXT: str x13, [sp, #192] ; 8-byte Folded Spill +; GISEL-NEXT: ldp x13, x5, [x1, #80] +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x17, [sp, #176] ; 8-byte Folded Spill +; GISEL-NEXT: csel x17, xzr, x12, eq +; GISEL-NEXT: str x0, [sp, #112] ; 8-byte Folded Spill +; GISEL-NEXT: cmp x10, #8 +; GISEL-NEXT: orr x17, x0, x17 +; GISEL-NEXT: lsl x0, x13, x14 +; GISEL-NEXT: str x12, [sp, #280] ; 8-byte Folded Spill +; GISEL-NEXT: csel x17, x17, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsr x9, x16, x15 +; GISEL-NEXT: csel x4, xzr, x0, eq +; GISEL-NEXT: str x13, [sp, #200] ; 8-byte Folded Spill +; GISEL-NEXT: cmp x10, #9 +; GISEL-NEXT: stp x9, x0, [sp, #96] ; 16-byte Folded Spill +; GISEL-NEXT: orr x4, x9, x4 +; GISEL-NEXT: lsl x23, x5, x14 +; GISEL-NEXT: lsr x12, x13, x15 +; GISEL-NEXT: ldp x9, x13, [x1, #96] +; GISEL-NEXT: csel x17, x4, x17, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x28, [sp, #256] ; 8-byte Folded Spill +; GISEL-NEXT: csel x4, xzr, x23, eq +; GISEL-NEXT: cmp x10, #10 +; GISEL-NEXT: str x3, [sp, #120] ; 8-byte Folded Spill +; GISEL-NEXT: orr x4, x12, x4 +; GISEL-NEXT: lsl x16, x9, x14 +; GISEL-NEXT: stp x5, x9, [sp, #216] ; 16-byte Folded Spill +; GISEL-NEXT: csel x17, x4, x17, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsl x3, x11, x14 +; GISEL-NEXT: stp x16, x12, [sp, #72] ; 16-byte Folded Spill +; GISEL-NEXT: mov x12, x9 +; GISEL-NEXT: lsr x9, x5, x15 +; GISEL-NEXT: csel x4, xzr, x16, eq +; GISEL-NEXT: lsl x16, x13, x14 +; GISEL-NEXT: cmp x10, #11 +; GISEL-NEXT: orr x4, x9, x4 +; GISEL-NEXT: lsr x12, x12, x15 +; GISEL-NEXT: str x30, [sp, #48] ; 8-byte Folded Spill +; GISEL-NEXT: stp x16, x9, [sp, #56] ; 16-byte Folded Spill +; GISEL-NEXT: ldr x9, [x1, #112] +; GISEL-NEXT: csel x17, x4, x17, eq +; 
GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: stp x25, x24, [sp, #16] ; 16-byte Folded Spill +; GISEL-NEXT: ldr x5, [sp, #96] ; 8-byte Folded Reload +; GISEL-NEXT: csel x1, xzr, x16, eq +; GISEL-NEXT: lsl x16, x9, x14 +; GISEL-NEXT: cmp x10, #12 +; GISEL-NEXT: orr x1, x12, x1 +; GISEL-NEXT: stp x13, x9, [sp, #240] ; 16-byte Folded Spill +; GISEL-NEXT: stp x16, x12, [sp, #320] ; 16-byte Folded Spill +; GISEL-NEXT: mov x12, x9 +; GISEL-NEXT: csel x1, x1, x17, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsr x9, x13, x15 +; GISEL-NEXT: lsr x26, x12, x15 +; GISEL-NEXT: csel x17, xzr, x16, eq +; GISEL-NEXT: cmp x10, #13 +; GISEL-NEXT: str x23, [sp, #272] ; 8-byte Folded Spill +; GISEL-NEXT: orr x13, x9, x17 +; GISEL-NEXT: str x9, [sp, #312] ; 8-byte Folded Spill +; GISEL-NEXT: mov x9, x28 +; GISEL-NEXT: csel x13, x13, x1, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: lsr x22, x9, x15 +; GISEL-NEXT: csel x28, xzr, x27, eq +; GISEL-NEXT: cmp x10, #14 +; GISEL-NEXT: str x2, [sp, #8] ; 8-byte Folded Spill +; GISEL-NEXT: orr x28, x26, x28 +; GISEL-NEXT: ldp x0, x16, [sp, #120] ; 16-byte Folded Reload +; GISEL-NEXT: csel x12, x28, x13, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: ldr x13, [sp, #304] ; 8-byte Folded Reload +; GISEL-NEXT: csel x28, xzr, x3, eq +; GISEL-NEXT: cmp x10, #15 +; GISEL-NEXT: stp x22, x3, [sp, #32] ; 16-byte Folded Spill +; GISEL-NEXT: orr x28, x22, x28 +; GISEL-NEXT: ldp x15, x14, [sp, #72] ; 16-byte Folded Reload +; GISEL-NEXT: csel x9, x28, x12, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: mov x28, x24 +; GISEL-NEXT: csel x9, x7, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: ldr x7, [sp, #88] ; 8-byte Folded Reload +; GISEL-NEXT: str x9, [sp, #136] ; 8-byte Folded Spill +; GISEL-NEXT: csel x9, xzr, x19, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: orr x9, x20, x9 +; GISEL-NEXT: ldr x12, [sp, #280] ; 8-byte Folded Reload +; GISEL-NEXT: csel x9, x9, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x30, eq +; GISEL-NEXT: cmp x10, 
#1 +; GISEL-NEXT: mov x30, x25 +; GISEL-NEXT: orr x20, x7, x20 +; GISEL-NEXT: ldp x4, x19, [sp, #104] ; 16-byte Folded Reload +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x24, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: mov x24, x6 +; GISEL-NEXT: orr x20, x25, x20 +; GISEL-NEXT: mov x25, x21 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x16, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: orr x20, x21, x20 +; GISEL-NEXT: ldp x1, x17, [sp, #56] ; 16-byte Folded Reload +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x13, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: orr x20, x0, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x2, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: orr x20, x6, x20 +; GISEL-NEXT: ldp x21, x6, [sp, #288] ; 16-byte Folded Reload +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x6, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: orr x20, x21, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x12, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: orr x20, x19, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x4, eq +; GISEL-NEXT: cmp x10, #8 +; GISEL-NEXT: orr x20, x5, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x23, eq +; GISEL-NEXT: cmp x10, #9 +; GISEL-NEXT: ldr x23, [sp, #328] ; 8-byte Folded Reload +; GISEL-NEXT: orr x20, x14, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x15, eq +; GISEL-NEXT: cmp x10, #10 +; GISEL-NEXT: orr x20, x17, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x1, eq +; GISEL-NEXT: cmp x10, #11 +; GISEL-NEXT: orr x20, x23, x20 +; 
GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: ldp x23, x20, [sp, #312] ; 16-byte Folded Reload +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x20, eq +; GISEL-NEXT: cmp x10, #12 +; GISEL-NEXT: orr x20, x23, x20 +; GISEL-NEXT: mov x23, x26 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x27, eq +; GISEL-NEXT: cmp x10, #13 +; GISEL-NEXT: orr x20, x26, x20 +; GISEL-NEXT: ldr x26, [sp, #272] ; 8-byte Folded Reload +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x3, eq +; GISEL-NEXT: cmp x10, #14 +; GISEL-NEXT: ldr x3, [sp, #232] ; 8-byte Folded Reload +; GISEL-NEXT: orr x20, x22, x20 +; GISEL-NEXT: mov x22, x23 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: cmp x10, #15 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x9, x3, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: mov x3, x16 +; GISEL-NEXT: str x9, [sp, #232] ; 8-byte Folded Spill +; GISEL-NEXT: ldr x9, [sp, #48] ; 8-byte Folded Reload +; GISEL-NEXT: csel x9, xzr, x9, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: orr x9, x7, x9 +; GISEL-NEXT: ldr x7, [sp, #312] ; 8-byte Folded Reload +; GISEL-NEXT: csel x9, x9, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x28, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: mov x28, x21 +; GISEL-NEXT: orr x20, x30, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x16, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: mov x16, x0 +; GISEL-NEXT: orr x20, x25, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x13, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: orr x20, x0, x20 +; GISEL-NEXT: mov x0, x19 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x2, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: mov x2, x4 +; GISEL-NEXT: orr x20, x24, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; 
GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x6, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: mov x6, x5 +; GISEL-NEXT: orr x20, x21, x20 +; GISEL-NEXT: mov x21, x25 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x12, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: ldr x12, [sp, #208] ; 8-byte Folded Reload +; GISEL-NEXT: orr x20, x19, x20 +; GISEL-NEXT: mov x19, x27 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x4, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: orr x20, x5, x20 +; GISEL-NEXT: ldp x30, x4, [sp, #320] ; 16-byte Folded Reload +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: mov x5, x3 +; GISEL-NEXT: csel x20, xzr, x26, eq +; GISEL-NEXT: cmp x10, #8 +; GISEL-NEXT: orr x20, x14, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x15, eq +; GISEL-NEXT: cmp x10, #9 +; GISEL-NEXT: orr x20, x17, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x1, eq +; GISEL-NEXT: cmp x10, #10 +; GISEL-NEXT: orr x20, x4, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x30, eq +; GISEL-NEXT: cmp x10, #11 +; GISEL-NEXT: orr x20, x7, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x27, eq +; GISEL-NEXT: cmp x10, #12 +; GISEL-NEXT: orr x20, x23, x20 +; GISEL-NEXT: ldp x27, x23, [sp, #32] ; 16-byte Folded Reload +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x23, eq +; GISEL-NEXT: cmp x10, #13 +; GISEL-NEXT: orr x20, x27, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: cmp x10, #14 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #15 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x9, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: str x9, 
[sp, #208] ; 8-byte Folded Spill +; GISEL-NEXT: ldp x12, x9, [sp, #16] ; 16-byte Folded Reload +; GISEL-NEXT: csel x9, xzr, x9, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: orr x9, x12, x9 +; GISEL-NEXT: ldr x12, [sp, #184] ; 8-byte Folded Reload +; GISEL-NEXT: csel x9, x9, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x3, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: ldr x3, [sp, #296] ; 8-byte Folded Reload +; GISEL-NEXT: orr x20, x25, x20 +; GISEL-NEXT: ldr x25, [sp, #280] ; 8-byte Folded Reload +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x13, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: ldr x13, [sp, #8] ; 8-byte Folded Reload +; GISEL-NEXT: orr x20, x16, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x13, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: orr x20, x24, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x3, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: orr x20, x28, x20 +; GISEL-NEXT: mov x28, x16 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x25, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: orr x20, x0, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x2, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: orr x20, x6, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x26, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: mov x26, x14 +; GISEL-NEXT: orr x20, x14, x20 +; GISEL-NEXT: mov x14, x15 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x15, eq +; GISEL-NEXT: cmp x10, #8 +; GISEL-NEXT: mov x15, x17 +; GISEL-NEXT: orr x20, x17, x20 +; GISEL-NEXT: mov x17, x1 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x1, eq +; GISEL-NEXT: cmp x10, #9 +; GISEL-NEXT: 
mov x1, x4 +; GISEL-NEXT: orr x20, x4, x20 +; GISEL-NEXT: mov x4, x30 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x30, eq +; GISEL-NEXT: cmp x10, #10 +; GISEL-NEXT: ldr x30, [sp, #272] ; 8-byte Folded Reload +; GISEL-NEXT: orr x20, x7, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x19, eq +; GISEL-NEXT: cmp x10, #11 +; GISEL-NEXT: orr x20, x22, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x23, eq +; GISEL-NEXT: cmp x10, #12 +; GISEL-NEXT: orr x20, x27, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: cmp x10, #13 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #14 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #15 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x9, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: ldr x12, [sp, #168] ; 8-byte Folded Reload +; GISEL-NEXT: str x9, [sp, #184] ; 8-byte Folded Spill +; GISEL-NEXT: csel x9, xzr, x5, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: orr x9, x21, x9 +; GISEL-NEXT: ldr x5, [sp, #304] ; 8-byte Folded Reload +; GISEL-NEXT: mov x21, x0 +; GISEL-NEXT: csel x9, x9, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x5, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: orr x20, x16, x20 +; GISEL-NEXT: mov x16, x24 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x13, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: orr x20, x24, x20 +; GISEL-NEXT: ldr x24, [sp, #288] ; 8-byte Folded Reload +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x3, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: orr x20, x24, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x25, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: orr x20, x0, x20 +; GISEL-NEXT: csel x9, x20, x9, 
eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x2, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: orr x20, x6, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x30, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: orr x20, x26, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x14, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: orr x20, x15, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x17, eq +; GISEL-NEXT: cmp x10, #8 +; GISEL-NEXT: orr x20, x1, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x4, eq +; GISEL-NEXT: cmp x10, #9 +; GISEL-NEXT: orr x20, x7, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x19, eq +; GISEL-NEXT: cmp x10, #10 +; GISEL-NEXT: orr x20, x22, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x23, eq +; GISEL-NEXT: cmp x10, #11 +; GISEL-NEXT: orr x20, x27, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: cmp x10, #12 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #13 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #14 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #15 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x9, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: ldr x12, [sp, #144] ; 8-byte Folded Reload +; GISEL-NEXT: str x9, [sp, #168] ; 8-byte Folded Spill +; GISEL-NEXT: csel x9, xzr, x5, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: orr x9, x28, x9 +; GISEL-NEXT: mov x28, x3 +; GISEL-NEXT: mov x5, x7 +; GISEL-NEXT: csel x9, x9, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x13, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: orr x20, x16, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel 
x20, xzr, x3, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: orr x20, x24, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x25, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: orr x20, x0, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x2, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: orr x20, x6, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x30, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: orr x20, x26, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x14, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: orr x20, x15, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x17, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: orr x20, x1, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x4, eq +; GISEL-NEXT: cmp x10, #8 +; GISEL-NEXT: orr x20, x7, x20 +; GISEL-NEXT: mov x7, x19 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x19, eq +; GISEL-NEXT: cmp x10, #9 +; GISEL-NEXT: mov x19, x22 +; GISEL-NEXT: orr x20, x22, x20 +; GISEL-NEXT: mov x22, x23 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x20, xzr, x23, eq +; GISEL-NEXT: cmp x10, #10 +; GISEL-NEXT: orr x20, x27, x20 +; GISEL-NEXT: csel x9, x20, x9, eq +; GISEL-NEXT: cmp x10, #11 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #12 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #13 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #14 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #15 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x9, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: ldr x12, [sp, #152] ; 8-byte Folded Reload +; GISEL-NEXT: str x9, [sp, #304] ; 
8-byte Folded Spill +; GISEL-NEXT: csel x9, xzr, x13, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: orr x9, x16, x9 +; GISEL-NEXT: mov x16, x0 +; GISEL-NEXT: csel x9, x9, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x3, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: orr x3, x24, x3 +; GISEL-NEXT: csel x9, x3, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x25, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: orr x3, x21, x3 +; GISEL-NEXT: csel x9, x3, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x2, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: orr x3, x6, x3 +; GISEL-NEXT: csel x9, x3, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x30, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: orr x3, x26, x3 +; GISEL-NEXT: csel x9, x3, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x14, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: orr x3, x15, x3 +; GISEL-NEXT: csel x9, x3, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x17, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: orr x3, x1, x3 +; GISEL-NEXT: csel x9, x3, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x4, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: orr x3, x5, x3 +; GISEL-NEXT: csel x9, x3, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x7, eq +; GISEL-NEXT: cmp x10, #8 +; GISEL-NEXT: orr x3, x19, x3 +; GISEL-NEXT: csel x9, x3, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x23, eq +; GISEL-NEXT: cmp x10, #9 +; GISEL-NEXT: orr x3, x27, x3 +; GISEL-NEXT: csel x9, x3, x9, eq +; GISEL-NEXT: cmp x10, #10 +; GISEL-NEXT: mov x3, x2 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #11 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #12 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #13 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #14 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #15 +; GISEL-NEXT: csel x9, 
x11, x9, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x20, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: ldr x12, [sp, #160] ; 8-byte Folded Reload +; GISEL-NEXT: csel x9, xzr, x28, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: orr x9, x24, x9 +; GISEL-NEXT: csel x9, x9, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x25, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: orr x0, x21, x0 +; GISEL-NEXT: mov x21, x6 +; GISEL-NEXT: csel x9, x0, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x2, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: orr x0, x6, x0 +; GISEL-NEXT: csel x9, x0, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x30, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: orr x0, x26, x0 +; GISEL-NEXT: csel x9, x0, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x14, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: orr x0, x15, x0 +; GISEL-NEXT: csel x9, x0, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x17, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: orr x0, x1, x0 +; GISEL-NEXT: csel x9, x0, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x4, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: orr x0, x5, x0 +; GISEL-NEXT: csel x9, x0, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x7, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: orr x0, x19, x0 +; GISEL-NEXT: csel x9, x0, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x23, eq +; GISEL-NEXT: cmp x10, #8 +; GISEL-NEXT: orr x0, x27, x0 +; GISEL-NEXT: csel x9, x0, x9, eq +; GISEL-NEXT: cmp x10, #9 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #10 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #11 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #12 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #13 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #14 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #15 +; 
GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x2, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: ldr x12, [sp, #176] ; 8-byte Folded Reload +; GISEL-NEXT: csel x9, xzr, x25, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: orr x9, x16, x9 +; GISEL-NEXT: ldr x16, [sp, #216] ; 8-byte Folded Reload +; GISEL-NEXT: csel x9, x9, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x3, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: orr x13, x6, x13 +; GISEL-NEXT: csel x9, x13, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x30, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: orr x13, x26, x13 +; GISEL-NEXT: csel x9, x13, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x14, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: orr x13, x15, x13 +; GISEL-NEXT: csel x9, x13, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x17, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: orr x13, x1, x13 +; GISEL-NEXT: csel x9, x13, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x4, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: orr x13, x5, x13 +; GISEL-NEXT: csel x9, x13, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x7, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: orr x13, x19, x13 +; GISEL-NEXT: csel x9, x13, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x23, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: orr x13, x27, x13 +; GISEL-NEXT: csel x9, x13, x9, eq +; GISEL-NEXT: cmp x10, #8 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #9 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #10 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #11 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #12 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #13 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #14 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #15 +; GISEL-NEXT: csel 
x9, x11, x9, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x6, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x9, xzr, x3, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: orr x9, x21, x9 +; GISEL-NEXT: csel x9, x9, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x30, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: orr x12, x26, x12 +; GISEL-NEXT: csel x9, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x14, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: orr x12, x15, x12 +; GISEL-NEXT: csel x9, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x17, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: orr x12, x1, x12 +; GISEL-NEXT: csel x9, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x4, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: orr x12, x5, x12 +; GISEL-NEXT: csel x9, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x7, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: orr x12, x19, x12 +; GISEL-NEXT: csel x9, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x23, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: orr x12, x27, x12 +; GISEL-NEXT: csel x9, x12, x9, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: ldr x12, [sp, #192] ; 8-byte Folded Reload +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #8 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #9 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #10 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #11 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #12 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #13 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #14 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #15 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x9, x12, x9, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x30, eq +; GISEL-NEXT: cmp x10, #0 
+; GISEL-NEXT: orr x12, x26, x12 +; GISEL-NEXT: ldp x29, x30, [sp, #416] ; 16-byte Folded Reload +; GISEL-NEXT: csel x12, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x14, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: orr x13, x15, x13 +; GISEL-NEXT: ldp x26, x25, [sp, #352] ; 16-byte Folded Reload +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x17, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: orr x13, x1, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x4, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: orr x13, x5, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x7, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: orr x13, x19, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x13, xzr, x23, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: orr x13, x27, x13 +; GISEL-NEXT: csel x12, x13, x12, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: ldr x13, [sp, #200] ; 8-byte Folded Reload +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #8 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #9 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #10 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #11 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #12 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #13 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #14 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #15 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x13, x13, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x14, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: ldr x14, [sp, #264] ; 8-byte Folded Reload +; GISEL-NEXT: orr x12, x15, x12 +; GISEL-NEXT: 
ldr x15, [sp, #136] ; 8-byte Folded Reload +; GISEL-NEXT: csel x12, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: stp x9, x13, [x14, #72] +; GISEL-NEXT: csel x0, xzr, x17, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: str x15, [x14] +; GISEL-NEXT: orr x0, x1, x0 +; GISEL-NEXT: ldr x15, [sp, #232] ; 8-byte Folded Reload +; GISEL-NEXT: stp x2, x6, [x14, #56] +; GISEL-NEXT: csel x12, x0, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x4, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: str x15, [x14, #8] +; GISEL-NEXT: orr x0, x5, x0 +; GISEL-NEXT: ldr x15, [sp, #208] ; 8-byte Folded Reload +; GISEL-NEXT: csel x12, x0, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x7, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: str x15, [x14, #16] +; GISEL-NEXT: orr x0, x19, x0 +; GISEL-NEXT: ldr x15, [sp, #184] ; 8-byte Folded Reload +; GISEL-NEXT: csel x12, x0, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x0, xzr, x23, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: str x15, [x14, #24] +; GISEL-NEXT: orr x0, x27, x0 +; GISEL-NEXT: ldr x15, [sp, #168] ; 8-byte Folded Reload +; GISEL-NEXT: csel x12, x0, x12, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: str x15, [x14, #32] +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: ldr x15, [sp, #304] ; 8-byte Folded Reload +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #8 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #9 +; GISEL-NEXT: stp x15, x20, [x14, #40] +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #10 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #11 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #12 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #13 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #14 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #15 +; GISEL-NEXT: csel 
x12, x11, x12, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x0, x16, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: ldr x16, [sp, #224] ; 8-byte Folded Reload +; GISEL-NEXT: csel x12, xzr, x17, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: orr x12, x1, x12 +; GISEL-NEXT: csel x12, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x4, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: orr x3, x5, x3 +; GISEL-NEXT: csel x12, x3, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x7, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: orr x3, x19, x3 +; GISEL-NEXT: csel x12, x3, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x3, xzr, x23, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: orr x3, x27, x3 +; GISEL-NEXT: ldp x24, x23, [sp, #368] ; 16-byte Folded Reload +; GISEL-NEXT: csel x12, x3, x12, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #8 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #9 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #10 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #11 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #12 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #13 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #14 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #15 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x3, x16, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: ldr x16, [sp, #240] ; 8-byte Folded Reload +; GISEL-NEXT: csel x12, xzr, x4, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: stp x0, x3, [x14, #88] +; GISEL-NEXT: orr x12, x5, x12 +; GISEL-NEXT: csel x12, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: 
csel x4, xzr, x7, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: orr x4, x19, x4 +; GISEL-NEXT: csel x12, x4, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x4, xzr, x22, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: orr x4, x27, x4 +; GISEL-NEXT: csel x12, x4, x12, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #8 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #9 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #10 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #11 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #12 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #13 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #14 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #15 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x4, x16, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: ldr x16, [sp, #248] ; 8-byte Folded Reload +; GISEL-NEXT: csel x12, xzr, x7, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: orr x12, x19, x12 +; GISEL-NEXT: ldp x20, x19, [sp, #400] ; 16-byte Folded Reload +; GISEL-NEXT: csel x12, x12, x11, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x17, xzr, x22, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: orr x17, x27, x17 +; GISEL-NEXT: csel x12, x17, x12, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #7 +; 
GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #8 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #9 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #10 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #11 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #12 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #13 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #14 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #15 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: csel x17, x16, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: csel x12, xzr, x22, eq +; GISEL-NEXT: cmp x10, #0 +; GISEL-NEXT: stp x4, x17, [x14, #104] +; GISEL-NEXT: orr x12, x27, x12 +; GISEL-NEXT: ldp x22, x21, [sp, #384] ; 16-byte Folded Reload +; GISEL-NEXT: csel x12, x12, x11, eq +; GISEL-NEXT: cmp x10, #1 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #2 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #3 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #4 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #5 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #6 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #7 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #8 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #9 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #10 +; GISEL-NEXT: csel x12, x11, x12, eq +; GISEL-NEXT: cmp x10, #11 +; GISEL-NEXT: csel x9, x11, x12, eq +; GISEL-NEXT: cmp x10, #12 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #13 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #14 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x10, #15 +; GISEL-NEXT: csel x9, x11, x9, eq +; GISEL-NEXT: cmp x8, #0 +; GISEL-NEXT: ldr x8, [sp, #256] ; 8-byte Folded Reload +; GISEL-NEXT: ldp x28, x27, [sp, #336] ; 16-byte 
Folded Reload +; GISEL-NEXT: csel x8, x8, x9, eq +; GISEL-NEXT: str x8, [x14, #120] +; GISEL-NEXT: add sp, sp, #432 +; GISEL-NEXT: ret +entry: + %input_val = load i1024, ptr %input, align 128 + %shift_ext = zext i32 %shift to i1024 + %shifted = ashr i1024 %input_val, %shift_ext + store i1024 %shifted, ptr %result, align 128 + ret void +} + + +; Constant shift tests. + +; Zero shift tests +define void @test_shl_i512_const_zero(ptr %result, ptr %input) { +; SDAG-LABEL: test_shl_i512_const_zero: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x8, [x1, #48] +; SDAG-NEXT: ldr q0, [x1] +; SDAG-NEXT: ldp x11, x10, [x1, #16] +; SDAG-NEXT: ldp x13, x12, [x1, #32] +; SDAG-NEXT: str q0, [x0] +; SDAG-NEXT: stp x9, x8, [x0, #48] +; SDAG-NEXT: stp x11, x10, [x0, #16] +; SDAG-NEXT: stp x13, x12, [x0, #32] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_shl_i512_const_zero: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: stp x12, x13, [x0, #32] +; GISEL-NEXT: stp x14, x15, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = shl i512 %input_val, 0 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_lshr_i512_const_zero(ptr %result, ptr %input) { +; SDAG-LABEL: test_lshr_i512_const_zero: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x8, [x1, #48] +; SDAG-NEXT: ldr q0, [x1] +; SDAG-NEXT: ldp x11, x10, [x1, #16] +; SDAG-NEXT: ldp x13, x12, [x1, #32] +; SDAG-NEXT: str q0, [x0] +; SDAG-NEXT: stp x9, x8, [x0, #48] +; SDAG-NEXT: stp x11, x10, [x0, #16] +; SDAG-NEXT: stp x13, x12, [x0, #32] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_lshr_i512_const_zero: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: ldp 
x14, x15, [x1, #48] +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: stp x12, x13, [x0, #32] +; GISEL-NEXT: stp x14, x15, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = lshr i512 %input_val, 0 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_ashr_i512_const_zero(ptr %result, ptr %input) { +; SDAG-LABEL: test_ashr_i512_const_zero: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x8, [x1, #48] +; SDAG-NEXT: ldr q0, [x1] +; SDAG-NEXT: ldp x11, x10, [x1, #16] +; SDAG-NEXT: ldp x13, x12, [x1, #32] +; SDAG-NEXT: str q0, [x0] +; SDAG-NEXT: stp x9, x8, [x0, #48] +; SDAG-NEXT: stp x11, x10, [x0, #16] +; SDAG-NEXT: stp x13, x12, [x0, #32] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_ashr_i512_const_zero: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: stp x12, x13, [x0, #32] +; GISEL-NEXT: stp x14, x15, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = ashr i512 %input_val, 0 + store i512 %shifted, ptr %result, align 64 + ret void +} + +; Word-aligned constant shifts (32-bit multiples for i512 -> i32 narrowing) +define void @test_shl_i512_const_32(ptr %result, ptr %input) { +; SDAG-LABEL: test_shl_i512_const_32: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x12, x13, [x1, #16] +; SDAG-NEXT: ldur x8, [x1, #36] +; SDAG-NEXT: ldp x14, x15, [x1] +; SDAG-NEXT: ldur x9, [x1, #28] +; SDAG-NEXT: ldur x10, [x1, #44] +; SDAG-NEXT: ldur x11, [x1, #52] +; SDAG-NEXT: stp x9, x8, [x0, #32] +; SDAG-NEXT: extr x9, x13, x12, #32 +; SDAG-NEXT: stp x10, x11, [x0, #48] +; SDAG-NEXT: extr x10, x12, x15, #32 +; SDAG-NEXT: lsl x8, x14, #32 +; SDAG-NEXT: stp x10, x9, [x0, #16] +; SDAG-NEXT: extr x10, x15, x14, #32 +; SDAG-NEXT: 
stp x8, x10, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_shl_i512_const_32: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x11, x12, [x1, #16] +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: lsr x10, x8, #32 +; GISEL-NEXT: lsr x13, x9, #32 +; GISEL-NEXT: lsl x8, x8, #32 +; GISEL-NEXT: orr x9, x10, x9, lsl #32 +; GISEL-NEXT: lsr x10, x11, #32 +; GISEL-NEXT: orr x11, x13, x11, lsl #32 +; GISEL-NEXT: ldp x13, x16, [x1, #48] +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: lsr x8, x12, #32 +; GISEL-NEXT: orr x10, x10, x12, lsl #32 +; GISEL-NEXT: lsr x12, x14, #32 +; GISEL-NEXT: lsr x9, x15, #32 +; GISEL-NEXT: orr x8, x8, x14, lsl #32 +; GISEL-NEXT: stp x11, x10, [x0, #16] +; GISEL-NEXT: orr x11, x12, x15, lsl #32 +; GISEL-NEXT: lsr x12, x13, #32 +; GISEL-NEXT: orr x9, x9, x13, lsl #32 +; GISEL-NEXT: stp x8, x11, [x0, #32] +; GISEL-NEXT: orr x8, x12, x16, lsl #32 +; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = shl i512 %input_val, 32 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_lshr_i512_const_32(ptr %result, ptr %input) { +; SDAG-LABEL: test_lshr_i512_const_32: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x11, x10, [x1, #48] +; SDAG-NEXT: ldur x8, [x1, #12] +; SDAG-NEXT: ldp x15, x14, [x1, #32] +; SDAG-NEXT: ldur x9, [x1, #4] +; SDAG-NEXT: ldp x12, x13, [x1, #16] +; SDAG-NEXT: extr x16, x10, x11, #32 +; SDAG-NEXT: stp x9, x8, [x0] +; SDAG-NEXT: lsr x9, x10, #32 +; SDAG-NEXT: extr x8, x14, x15, #32 +; SDAG-NEXT: extr x10, x11, x14, #32 +; SDAG-NEXT: stp x16, x9, [x0, #48] +; SDAG-NEXT: extr x9, x13, x12, #32 +; SDAG-NEXT: stp x8, x10, [x0, #32] +; SDAG-NEXT: extr x8, x15, x13, #32 +; SDAG-NEXT: stp x9, x8, [x0, #16] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_lshr_i512_const_32: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x11, [x1] +; GISEL-NEXT: ldp x10, x14, [x1, #24] +; GISEL-NEXT: 
ldr x16, [x1, #56] +; GISEL-NEXT: lsl x12, x8, #32 +; GISEL-NEXT: lsl x13, x9, #32 +; GISEL-NEXT: lsl x15, x10, #32 +; GISEL-NEXT: orr x11, x12, x11, lsr #32 +; GISEL-NEXT: orr x8, x13, x8, lsr #32 +; GISEL-NEXT: lsl x13, x14, #32 +; GISEL-NEXT: orr x9, x15, x9, lsr #32 +; GISEL-NEXT: ldp x12, x15, [x1, #40] +; GISEL-NEXT: stp x11, x8, [x0] +; GISEL-NEXT: orr x10, x13, x10, lsr #32 +; GISEL-NEXT: lsl x8, x16, #32 +; GISEL-NEXT: lsl x11, x12, #32 +; GISEL-NEXT: lsl x13, x15, #32 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: orr x8, x8, x15, lsr #32 +; GISEL-NEXT: lsr x10, x16, #32 +; GISEL-NEXT: orr x11, x11, x14, lsr #32 +; GISEL-NEXT: orr x9, x13, x12, lsr #32 +; GISEL-NEXT: stp x8, x10, [x0, #48] +; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = lshr i512 %input_val, 32 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_ashr_i512_const_32(ptr %result, ptr %input) { +; SDAG-LABEL: test_ashr_i512_const_32: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x11, x10, [x1, #48] +; SDAG-NEXT: ldur x8, [x1, #12] +; SDAG-NEXT: ldp x15, x14, [x1, #32] +; SDAG-NEXT: ldur x9, [x1, #4] +; SDAG-NEXT: ldp x12, x13, [x1, #16] +; SDAG-NEXT: extr x16, x10, x11, #32 +; SDAG-NEXT: stp x9, x8, [x0] +; SDAG-NEXT: asr x9, x10, #32 +; SDAG-NEXT: extr x8, x14, x15, #32 +; SDAG-NEXT: extr x10, x11, x14, #32 +; SDAG-NEXT: stp x16, x9, [x0, #48] +; SDAG-NEXT: extr x9, x13, x12, #32 +; SDAG-NEXT: stp x8, x10, [x0, #32] +; SDAG-NEXT: extr x8, x15, x13, #32 +; SDAG-NEXT: stp x9, x8, [x0, #16] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_ashr_i512_const_32: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x11, [x1] +; GISEL-NEXT: ldp x10, x13, [x1, #24] +; GISEL-NEXT: ldr x17, [x1, #56] +; GISEL-NEXT: lsl x12, x8, #32 +; GISEL-NEXT: lsl x15, x9, #32 +; GISEL-NEXT: lsl x16, x10, #32 +; GISEL-NEXT: orr x11, x12, x11, lsr #32 +; GISEL-NEXT: ldp x14, x12, [x1, 
#40] +; GISEL-NEXT: orr x8, x15, x8, lsr #32 +; GISEL-NEXT: lsl x15, x13, #32 +; GISEL-NEXT: orr x9, x16, x9, lsr #32 +; GISEL-NEXT: asr x16, x17, #63 +; GISEL-NEXT: stp x11, x8, [x0] +; GISEL-NEXT: lsl x11, x14, #32 +; GISEL-NEXT: orr x10, x15, x10, lsr #32 +; GISEL-NEXT: lsl x15, x12, #32 +; GISEL-NEXT: orr x8, x11, x13, lsr #32 +; GISEL-NEXT: lsl x11, x17, #32 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: orr x9, x15, x14, lsr #32 +; GISEL-NEXT: lsl x13, x16, #32 +; GISEL-NEXT: orr x10, x11, x12, lsr #32 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: orr x8, x13, x17, asr #32 +; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = ashr i512 %input_val, 32 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_shl_i512_const_64(ptr %result, ptr %input) { +; SDAG-LABEL: test_shl_i512_const_64: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x10, x8, [x1, #40] +; SDAG-NEXT: ldr q0, [x1] +; SDAG-NEXT: ldp x12, x9, [x1, #24] +; SDAG-NEXT: ldr x11, [x1, #16] +; SDAG-NEXT: str xzr, [x0] +; SDAG-NEXT: stp x10, x8, [x0, #48] +; SDAG-NEXT: stp x12, x9, [x0, #32] +; SDAG-NEXT: str x11, [x0, #24] +; SDAG-NEXT: stur q0, [x0, #8] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_shl_i512_const_64: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldr x14, [x1, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: stp xzr, x8, [x0] +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: stp x11, x12, [x0, #32] +; GISEL-NEXT: stp x13, x14, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = shl i512 %input_val, 64 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_lshr_i512_const_64(ptr %result, ptr %input) { +; SDAG-LABEL: test_lshr_i512_const_64: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x11, x8, [x1, #48] +; SDAG-NEXT: ldur q0, [x1, #8] +; 
SDAG-NEXT: ldp x10, x9, [x1, #24] +; SDAG-NEXT: ldr x12, [x1, #40] +; SDAG-NEXT: str q0, [x0] +; SDAG-NEXT: stp x8, xzr, [x0, #48] +; SDAG-NEXT: stp x12, x11, [x0, #32] +; SDAG-NEXT: stp x10, x9, [x0, #16] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_lshr_i512_const_64: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: stp x14, xzr, [x0, #48] +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: stp x12, x13, [x0, #32] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = lshr i512 %input_val, 64 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_ashr_i512_const_64(ptr %result, ptr %input) { +; SDAG-LABEL: test_ashr_i512_const_64: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x8, x9, [x1, #40] +; SDAG-NEXT: ldr x12, [x1, #56] +; SDAG-NEXT: ldp x11, x10, [x1, #24] +; SDAG-NEXT: ldur q0, [x1, #8] +; SDAG-NEXT: stp x8, x9, [x0, #32] +; SDAG-NEXT: asr x8, x12, #63 +; SDAG-NEXT: stp x11, x10, [x0, #16] +; SDAG-NEXT: str q0, [x0] +; SDAG-NEXT: stp x12, x8, [x0, #48] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_ashr_i512_const_64: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: asr x8, x14, #63 +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: stp x12, x13, [x0, #32] +; GISEL-NEXT: stp x14, x8, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = ashr i512 %input_val, 64 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_shl_i512_const_96(ptr %result, ptr %input) { +; SDAG-LABEL: test_shl_i512_const_96: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x12, x11, [x1, #16] +; SDAG-NEXT: ldur x9, [x1, #36] 
+; SDAG-NEXT: ldur x10, [x1, #44] +; SDAG-NEXT: ldur x8, [x1, #28] +; SDAG-NEXT: ldp x13, x14, [x1] +; SDAG-NEXT: stp x9, x10, [x0, #48] +; SDAG-NEXT: extr x9, x11, x12, #32 +; SDAG-NEXT: extr x10, x14, x13, #32 +; SDAG-NEXT: stp x9, x8, [x0, #32] +; SDAG-NEXT: extr x8, x12, x14, #32 +; SDAG-NEXT: lsl x9, x13, #32 +; SDAG-NEXT: stp x10, x8, [x0, #16] +; SDAG-NEXT: stp xzr, x9, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_shl_i512_const_96: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldr x15, [x1, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: lsr x14, x8, #32 +; GISEL-NEXT: lsr x16, x9, #32 +; GISEL-NEXT: lsl x8, x8, #32 +; GISEL-NEXT: orr x9, x14, x9, lsl #32 +; GISEL-NEXT: lsr x14, x10, #32 +; GISEL-NEXT: orr x10, x16, x10, lsl #32 +; GISEL-NEXT: stp xzr, x8, [x0] +; GISEL-NEXT: lsr x8, x11, #32 +; GISEL-NEXT: orr x11, x14, x11, lsl #32 +; GISEL-NEXT: lsr x14, x12, #32 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: lsr x9, x13, #32 +; GISEL-NEXT: orr x8, x8, x12, lsl #32 +; GISEL-NEXT: orr x10, x14, x13, lsl #32 +; GISEL-NEXT: orr x9, x9, x15, lsl #32 +; GISEL-NEXT: stp x11, x8, [x0, #32] +; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = shl i512 %input_val, 96 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_lshr_i512_const_96(ptr %result, ptr %input) { +; SDAG-LABEL: test_lshr_i512_const_96: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x10, [x1, #48] +; SDAG-NEXT: ldur x8, [x1, #20] +; SDAG-NEXT: ldp x13, x14, [x1, #32] +; SDAG-NEXT: ldur x11, [x1, #12] +; SDAG-NEXT: ldur x12, [x1, #28] +; SDAG-NEXT: lsr x15, x10, #32 +; SDAG-NEXT: stp x11, x8, [x0] +; SDAG-NEXT: extr x8, x10, x9, #32 +; SDAG-NEXT: extr x11, x9, x14, #32 +; SDAG-NEXT: extr x9, x14, x13, #32 +; SDAG-NEXT: stp x15, xzr, [x0, #48] +; SDAG-NEXT: stp x11, x8, [x0, #32] +; SDAG-NEXT: stp x12, x9, [x0, 
#16] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_lshr_i512_const_96: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1, #16] +; GISEL-NEXT: ldr x10, [x1, #8] +; GISEL-NEXT: ldp x11, x14, [x1, #32] +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: lsl x12, x8, #32 +; GISEL-NEXT: lsl x13, x9, #32 +; GISEL-NEXT: orr x10, x12, x10, lsr #32 +; GISEL-NEXT: lsl x12, x11, #32 +; GISEL-NEXT: orr x8, x13, x8, lsr #32 +; GISEL-NEXT: lsl x13, x14, #32 +; GISEL-NEXT: orr x9, x12, x9, lsr #32 +; GISEL-NEXT: stp x10, x8, [x0] +; GISEL-NEXT: lsl x10, x15, #32 +; GISEL-NEXT: orr x11, x13, x11, lsr #32 +; GISEL-NEXT: lsl x12, x16, #32 +; GISEL-NEXT: orr x8, x10, x14, lsr #32 +; GISEL-NEXT: lsr x10, x16, #32 +; GISEL-NEXT: stp x9, x11, [x0, #16] +; GISEL-NEXT: orr x9, x12, x15, lsr #32 +; GISEL-NEXT: stp x10, xzr, [x0, #48] +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = lshr i512 %input_val, 96 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_ashr_i512_const_96(ptr %result, ptr %input) { +; SDAG-LABEL: test_ashr_i512_const_96: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x10, [x1, #48] +; SDAG-NEXT: ldur x8, [x1, #12] +; SDAG-NEXT: ldur x11, [x1, #20] +; SDAG-NEXT: ldur x12, [x1, #28] +; SDAG-NEXT: ldp x13, x14, [x1, #32] +; SDAG-NEXT: asr x15, x10, #32 +; SDAG-NEXT: stp x8, x11, [x0] +; SDAG-NEXT: asr x8, x10, #63 +; SDAG-NEXT: extr x11, x9, x14, #32 +; SDAG-NEXT: stp x15, x8, [x0, #48] +; SDAG-NEXT: extr x9, x10, x9, #32 +; SDAG-NEXT: extr x8, x14, x13, #32 +; SDAG-NEXT: stp x11, x9, [x0, #32] +; SDAG-NEXT: stp x12, x8, [x0, #16] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_ashr_i512_const_96: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1, #16] +; GISEL-NEXT: ldr x11, [x1, #8] +; GISEL-NEXT: ldp x10, x13, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #32 +; GISEL-NEXT: lsl x14, x9, #32 +; GISEL-NEXT: lsl x15, x10, #32 +; GISEL-NEXT: orr x11, x12, x11, lsr #32 
+; GISEL-NEXT: ldp x12, x16, [x1, #48] +; GISEL-NEXT: orr x8, x14, x8, lsr #32 +; GISEL-NEXT: lsl x14, x13, #32 +; GISEL-NEXT: orr x9, x15, x9, lsr #32 +; GISEL-NEXT: asr x15, x16, #63 +; GISEL-NEXT: stp x11, x8, [x0] +; GISEL-NEXT: lsl x11, x12, #32 +; GISEL-NEXT: orr x10, x14, x10, lsr #32 +; GISEL-NEXT: lsl x14, x16, #32 +; GISEL-NEXT: orr x8, x11, x13, lsr #32 +; GISEL-NEXT: lsl x11, x15, #32 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: orr x9, x14, x12, lsr #32 +; GISEL-NEXT: orr x10, x11, x16, asr #32 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: stp x10, x15, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = ashr i512 %input_val, 96 + store i512 %shifted, ptr %result, align 64 + ret void +} + +; Bit-only shifts (< 64 bits) +define void @test_shl_i512_const_1(ptr %result, ptr %input) { +; SDAG-LABEL: test_shl_i512_const_1: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x8, x9, [x1, #40] +; SDAG-NEXT: ldr x10, [x1, #56] +; SDAG-NEXT: ldp x13, x11, [x1, #24] +; SDAG-NEXT: ldp x15, x14, [x1, #8] +; SDAG-NEXT: extr x12, x9, x8, #63 +; SDAG-NEXT: extr x9, x10, x9, #63 +; SDAG-NEXT: ldr x10, [x1] +; SDAG-NEXT: extr x16, x11, x13, #63 +; SDAG-NEXT: extr x8, x8, x11, #63 +; SDAG-NEXT: stp x12, x9, [x0, #48] +; SDAG-NEXT: extr x9, x14, x15, #63 +; SDAG-NEXT: extr x11, x13, x14, #63 +; SDAG-NEXT: stp x16, x8, [x0, #32] +; SDAG-NEXT: lsl x8, x10, #1 +; SDAG-NEXT: stp x9, x11, [x0, #16] +; SDAG-NEXT: extr x9, x15, x10, #63 +; SDAG-NEXT: stp x8, x9, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_shl_i512_const_1: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x11, x12, [x1, #16] +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: lsr x10, x8, #63 +; GISEL-NEXT: lsr x13, x9, #63 +; GISEL-NEXT: lsl x8, x8, #1 +; GISEL-NEXT: orr x9, x10, x9, lsl #1 +; GISEL-NEXT: lsr x10, x11, #63 +; GISEL-NEXT: orr x11, x13, x11, lsl #1 +; GISEL-NEXT: ldp x13, x16, [x1, #48] +; GISEL-NEXT: stp x8, 
x9, [x0] +; GISEL-NEXT: lsr x8, x12, #63 +; GISEL-NEXT: orr x10, x10, x12, lsl #1 +; GISEL-NEXT: lsr x12, x14, #63 +; GISEL-NEXT: lsr x9, x15, #63 +; GISEL-NEXT: orr x8, x8, x14, lsl #1 +; GISEL-NEXT: stp x11, x10, [x0, #16] +; GISEL-NEXT: orr x11, x12, x15, lsl #1 +; GISEL-NEXT: lsr x12, x13, #63 +; GISEL-NEXT: orr x9, x9, x13, lsl #1 +; GISEL-NEXT: stp x8, x11, [x0, #32] +; GISEL-NEXT: orr x8, x12, x16, lsl #1 +; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = shl i512 %input_val, 1 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_lshr_i512_const_1(ptr %result, ptr %input) { +; SDAG-LABEL: test_lshr_i512_const_1: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x8, [x1, #48] +; SDAG-NEXT: ldp x11, x10, [x1, #32] +; SDAG-NEXT: ldp x16, x15, [x1, #16] +; SDAG-NEXT: extr x12, x8, x9, #1 +; SDAG-NEXT: lsr x8, x8, #1 +; SDAG-NEXT: ldp x13, x14, [x1] +; SDAG-NEXT: extr x9, x9, x10, #1 +; SDAG-NEXT: stp x12, x8, [x0, #48] +; SDAG-NEXT: extr x12, x10, x11, #1 +; SDAG-NEXT: extr x8, x15, x16, #1 +; SDAG-NEXT: extr x10, x11, x15, #1 +; SDAG-NEXT: stp x12, x9, [x0, #32] +; SDAG-NEXT: extr x9, x14, x13, #1 +; SDAG-NEXT: stp x8, x10, [x0, #16] +; SDAG-NEXT: extr x8, x16, x14, #1 +; SDAG-NEXT: stp x9, x8, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_lshr_i512_const_1: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x11, [x1] +; GISEL-NEXT: ldp x10, x14, [x1, #24] +; GISEL-NEXT: ldr x16, [x1, #56] +; GISEL-NEXT: lsl x12, x8, #63 +; GISEL-NEXT: lsl x13, x9, #63 +; GISEL-NEXT: lsl x15, x10, #63 +; GISEL-NEXT: orr x11, x12, x11, lsr #1 +; GISEL-NEXT: orr x8, x13, x8, lsr #1 +; GISEL-NEXT: lsl x13, x14, #63 +; GISEL-NEXT: orr x9, x15, x9, lsr #1 +; GISEL-NEXT: ldp x12, x15, [x1, #40] +; GISEL-NEXT: stp x11, x8, [x0] +; GISEL-NEXT: orr x10, x13, x10, lsr #1 +; GISEL-NEXT: lsl x8, x16, #63 +; GISEL-NEXT: lsl x11, x12, #63 +; GISEL-NEXT: lsl x13, 
x15, #63 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: orr x8, x8, x15, lsr #1 +; GISEL-NEXT: lsr x10, x16, #1 +; GISEL-NEXT: orr x11, x11, x14, lsr #1 +; GISEL-NEXT: orr x9, x13, x12, lsr #1 +; GISEL-NEXT: stp x8, x10, [x0, #48] +; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = lshr i512 %input_val, 1 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_ashr_i512_const_1(ptr %result, ptr %input) { +; SDAG-LABEL: test_ashr_i512_const_1: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x8, [x1, #48] +; SDAG-NEXT: ldp x11, x10, [x1, #32] +; SDAG-NEXT: ldp x16, x15, [x1, #16] +; SDAG-NEXT: extr x12, x8, x9, #1 +; SDAG-NEXT: asr x8, x8, #1 +; SDAG-NEXT: ldp x13, x14, [x1] +; SDAG-NEXT: extr x9, x9, x10, #1 +; SDAG-NEXT: stp x12, x8, [x0, #48] +; SDAG-NEXT: extr x12, x10, x11, #1 +; SDAG-NEXT: extr x8, x15, x16, #1 +; SDAG-NEXT: extr x10, x11, x15, #1 +; SDAG-NEXT: stp x12, x9, [x0, #32] +; SDAG-NEXT: extr x9, x14, x13, #1 +; SDAG-NEXT: stp x8, x10, [x0, #16] +; SDAG-NEXT: extr x8, x16, x14, #1 +; SDAG-NEXT: stp x9, x8, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_ashr_i512_const_1: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x11, [x1] +; GISEL-NEXT: ldp x10, x13, [x1, #24] +; GISEL-NEXT: ldr x17, [x1, #56] +; GISEL-NEXT: lsl x12, x8, #63 +; GISEL-NEXT: lsl x15, x9, #63 +; GISEL-NEXT: lsl x16, x10, #63 +; GISEL-NEXT: orr x11, x12, x11, lsr #1 +; GISEL-NEXT: ldp x14, x12, [x1, #40] +; GISEL-NEXT: orr x8, x15, x8, lsr #1 +; GISEL-NEXT: lsl x15, x13, #63 +; GISEL-NEXT: orr x9, x16, x9, lsr #1 +; GISEL-NEXT: asr x16, x17, #63 +; GISEL-NEXT: stp x11, x8, [x0] +; GISEL-NEXT: lsl x11, x14, #63 +; GISEL-NEXT: orr x10, x15, x10, lsr #1 +; GISEL-NEXT: lsl x15, x12, #63 +; GISEL-NEXT: orr x8, x11, x13, lsr #1 +; GISEL-NEXT: lsl x11, x17, #63 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: orr x9, x15, x14, lsr #1 +; GISEL-NEXT: 
lsl x13, x16, #63 +; GISEL-NEXT: orr x10, x11, x12, lsr #1 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: orr x8, x13, x17, asr #1 +; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = ashr i512 %input_val, 1 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_shl_i512_const_15(ptr %result, ptr %input) { +; SDAG-LABEL: test_shl_i512_const_15: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x8, x9, [x1, #40] +; SDAG-NEXT: ldr x10, [x1, #56] +; SDAG-NEXT: ldp x13, x11, [x1, #24] +; SDAG-NEXT: ldp x15, x14, [x1, #8] +; SDAG-NEXT: extr x12, x9, x8, #49 +; SDAG-NEXT: extr x9, x10, x9, #49 +; SDAG-NEXT: ldr x10, [x1] +; SDAG-NEXT: extr x16, x11, x13, #49 +; SDAG-NEXT: extr x8, x8, x11, #49 +; SDAG-NEXT: stp x12, x9, [x0, #48] +; SDAG-NEXT: extr x9, x14, x15, #49 +; SDAG-NEXT: extr x11, x13, x14, #49 +; SDAG-NEXT: stp x16, x8, [x0, #32] +; SDAG-NEXT: lsl x8, x10, #15 +; SDAG-NEXT: stp x9, x11, [x0, #16] +; SDAG-NEXT: extr x9, x15, x10, #49 +; SDAG-NEXT: stp x8, x9, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_shl_i512_const_15: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x11, x12, [x1, #16] +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: lsr x10, x8, #49 +; GISEL-NEXT: lsr x13, x9, #49 +; GISEL-NEXT: lsl x8, x8, #15 +; GISEL-NEXT: orr x9, x10, x9, lsl #15 +; GISEL-NEXT: lsr x10, x11, #49 +; GISEL-NEXT: orr x11, x13, x11, lsl #15 +; GISEL-NEXT: ldp x13, x16, [x1, #48] +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: lsr x8, x12, #49 +; GISEL-NEXT: orr x10, x10, x12, lsl #15 +; GISEL-NEXT: lsr x12, x14, #49 +; GISEL-NEXT: lsr x9, x15, #49 +; GISEL-NEXT: orr x8, x8, x14, lsl #15 +; GISEL-NEXT: stp x11, x10, [x0, #16] +; GISEL-NEXT: orr x11, x12, x15, lsl #15 +; GISEL-NEXT: lsr x12, x13, #49 +; GISEL-NEXT: orr x9, x9, x13, lsl #15 +; GISEL-NEXT: stp x8, x11, [x0, #32] +; GISEL-NEXT: orr x8, x12, x16, lsl #15 +; GISEL-NEXT: stp x9, x8, 
[x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = shl i512 %input_val, 15 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_lshr_i512_const_15(ptr %result, ptr %input) { +; SDAG-LABEL: test_lshr_i512_const_15: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x8, [x1, #48] +; SDAG-NEXT: ldp x11, x10, [x1, #32] +; SDAG-NEXT: ldp x16, x15, [x1, #16] +; SDAG-NEXT: extr x12, x8, x9, #15 +; SDAG-NEXT: lsr x8, x8, #15 +; SDAG-NEXT: ldp x13, x14, [x1] +; SDAG-NEXT: extr x9, x9, x10, #15 +; SDAG-NEXT: stp x12, x8, [x0, #48] +; SDAG-NEXT: extr x12, x10, x11, #15 +; SDAG-NEXT: extr x8, x15, x16, #15 +; SDAG-NEXT: extr x10, x11, x15, #15 +; SDAG-NEXT: stp x12, x9, [x0, #32] +; SDAG-NEXT: extr x9, x14, x13, #15 +; SDAG-NEXT: stp x8, x10, [x0, #16] +; SDAG-NEXT: extr x8, x16, x14, #15 +; SDAG-NEXT: stp x9, x8, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_lshr_i512_const_15: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x11, [x1] +; GISEL-NEXT: ldp x10, x14, [x1, #24] +; GISEL-NEXT: ldr x16, [x1, #56] +; GISEL-NEXT: lsl x12, x8, #49 +; GISEL-NEXT: lsl x13, x9, #49 +; GISEL-NEXT: lsl x15, x10, #49 +; GISEL-NEXT: orr x11, x12, x11, lsr #15 +; GISEL-NEXT: orr x8, x13, x8, lsr #15 +; GISEL-NEXT: lsl x13, x14, #49 +; GISEL-NEXT: orr x9, x15, x9, lsr #15 +; GISEL-NEXT: ldp x12, x15, [x1, #40] +; GISEL-NEXT: stp x11, x8, [x0] +; GISEL-NEXT: orr x10, x13, x10, lsr #15 +; GISEL-NEXT: lsl x8, x16, #49 +; GISEL-NEXT: lsl x11, x12, #49 +; GISEL-NEXT: lsl x13, x15, #49 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: orr x8, x8, x15, lsr #15 +; GISEL-NEXT: lsr x10, x16, #15 +; GISEL-NEXT: orr x11, x11, x14, lsr #15 +; GISEL-NEXT: orr x9, x13, x12, lsr #15 +; GISEL-NEXT: stp x8, x10, [x0, #48] +; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = lshr i512 %input_val, 15 + store i512 %shifted, ptr %result, 
align 64 + ret void +} + +define void @test_ashr_i512_const_15(ptr %result, ptr %input) { +; SDAG-LABEL: test_ashr_i512_const_15: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x8, [x1, #48] +; SDAG-NEXT: ldp x11, x10, [x1, #32] +; SDAG-NEXT: ldp x16, x15, [x1, #16] +; SDAG-NEXT: extr x12, x8, x9, #15 +; SDAG-NEXT: asr x8, x8, #15 +; SDAG-NEXT: ldp x13, x14, [x1] +; SDAG-NEXT: extr x9, x9, x10, #15 +; SDAG-NEXT: stp x12, x8, [x0, #48] +; SDAG-NEXT: extr x12, x10, x11, #15 +; SDAG-NEXT: extr x8, x15, x16, #15 +; SDAG-NEXT: extr x10, x11, x15, #15 +; SDAG-NEXT: stp x12, x9, [x0, #32] +; SDAG-NEXT: extr x9, x14, x13, #15 +; SDAG-NEXT: stp x8, x10, [x0, #16] +; SDAG-NEXT: extr x8, x16, x14, #15 +; SDAG-NEXT: stp x9, x8, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_ashr_i512_const_15: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x11, [x1] +; GISEL-NEXT: ldp x10, x13, [x1, #24] +; GISEL-NEXT: ldr x17, [x1, #56] +; GISEL-NEXT: lsl x12, x8, #49 +; GISEL-NEXT: lsl x15, x9, #49 +; GISEL-NEXT: lsl x16, x10, #49 +; GISEL-NEXT: orr x11, x12, x11, lsr #15 +; GISEL-NEXT: ldp x14, x12, [x1, #40] +; GISEL-NEXT: orr x8, x15, x8, lsr #15 +; GISEL-NEXT: lsl x15, x13, #49 +; GISEL-NEXT: orr x9, x16, x9, lsr #15 +; GISEL-NEXT: asr x16, x17, #63 +; GISEL-NEXT: stp x11, x8, [x0] +; GISEL-NEXT: lsl x11, x14, #49 +; GISEL-NEXT: orr x10, x15, x10, lsr #15 +; GISEL-NEXT: lsl x15, x12, #49 +; GISEL-NEXT: orr x8, x11, x13, lsr #15 +; GISEL-NEXT: lsl x11, x17, #49 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: orr x9, x15, x14, lsr #15 +; GISEL-NEXT: lsl x13, x16, #49 +; GISEL-NEXT: orr x10, x11, x12, lsr #15 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: orr x8, x13, x17, asr #15 +; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = ashr i512 %input_val, 15 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_shl_i512_const_63(ptr %result, ptr 
%input) { +; SDAG-LABEL: test_shl_i512_const_63: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x8, x9, [x1, #40] +; SDAG-NEXT: ldr x10, [x1, #56] +; SDAG-NEXT: ldp x13, x11, [x1, #24] +; SDAG-NEXT: ldp x15, x14, [x1, #8] +; SDAG-NEXT: extr x12, x9, x8, #1 +; SDAG-NEXT: extr x9, x10, x9, #1 +; SDAG-NEXT: ldr x10, [x1] +; SDAG-NEXT: extr x16, x11, x13, #1 +; SDAG-NEXT: extr x8, x8, x11, #1 +; SDAG-NEXT: stp x12, x9, [x0, #48] +; SDAG-NEXT: extr x9, x14, x15, #1 +; SDAG-NEXT: extr x11, x13, x14, #1 +; SDAG-NEXT: stp x16, x8, [x0, #32] +; SDAG-NEXT: lsl x8, x10, #63 +; SDAG-NEXT: stp x9, x11, [x0, #16] +; SDAG-NEXT: extr x9, x15, x10, #1 +; SDAG-NEXT: stp x8, x9, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_shl_i512_const_63: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x11, x12, [x1, #16] +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: lsr x10, x8, #1 +; GISEL-NEXT: lsr x13, x9, #1 +; GISEL-NEXT: lsl x8, x8, #63 +; GISEL-NEXT: orr x9, x10, x9, lsl #63 +; GISEL-NEXT: lsr x10, x11, #1 +; GISEL-NEXT: orr x11, x13, x11, lsl #63 +; GISEL-NEXT: ldp x13, x16, [x1, #48] +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: lsr x8, x12, #1 +; GISEL-NEXT: orr x10, x10, x12, lsl #63 +; GISEL-NEXT: lsr x12, x14, #1 +; GISEL-NEXT: lsr x9, x15, #1 +; GISEL-NEXT: orr x8, x8, x14, lsl #63 +; GISEL-NEXT: stp x11, x10, [x0, #16] +; GISEL-NEXT: orr x11, x12, x15, lsl #63 +; GISEL-NEXT: lsr x12, x13, #1 +; GISEL-NEXT: orr x9, x9, x13, lsl #63 +; GISEL-NEXT: stp x8, x11, [x0, #32] +; GISEL-NEXT: orr x8, x12, x16, lsl #63 +; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = shl i512 %input_val, 63 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_lshr_i512_const_63(ptr %result, ptr %input) { +; SDAG-LABEL: test_lshr_i512_const_63: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x8, [x1, #48] +; SDAG-NEXT: ldp x11, x10, [x1, #32] +; SDAG-NEXT: ldp x16, x15, 
[x1, #16] +; SDAG-NEXT: extr x12, x8, x9, #63 +; SDAG-NEXT: lsr x8, x8, #63 +; SDAG-NEXT: ldp x13, x14, [x1] +; SDAG-NEXT: extr x9, x9, x10, #63 +; SDAG-NEXT: stp x12, x8, [x0, #48] +; SDAG-NEXT: extr x12, x10, x11, #63 +; SDAG-NEXT: extr x8, x15, x16, #63 +; SDAG-NEXT: extr x10, x11, x15, #63 +; SDAG-NEXT: stp x12, x9, [x0, #32] +; SDAG-NEXT: extr x9, x14, x13, #63 +; SDAG-NEXT: stp x8, x10, [x0, #16] +; SDAG-NEXT: extr x8, x16, x14, #63 +; SDAG-NEXT: stp x9, x8, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_lshr_i512_const_63: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x11, [x1] +; GISEL-NEXT: ldp x10, x14, [x1, #24] +; GISEL-NEXT: ldr x16, [x1, #56] +; GISEL-NEXT: lsl x12, x8, #1 +; GISEL-NEXT: lsl x13, x9, #1 +; GISEL-NEXT: lsl x15, x10, #1 +; GISEL-NEXT: orr x11, x12, x11, lsr #63 +; GISEL-NEXT: orr x8, x13, x8, lsr #63 +; GISEL-NEXT: lsl x13, x14, #1 +; GISEL-NEXT: orr x9, x15, x9, lsr #63 +; GISEL-NEXT: ldp x12, x15, [x1, #40] +; GISEL-NEXT: stp x11, x8, [x0] +; GISEL-NEXT: orr x10, x13, x10, lsr #63 +; GISEL-NEXT: lsl x8, x16, #1 +; GISEL-NEXT: lsl x11, x12, #1 +; GISEL-NEXT: lsl x13, x15, #1 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: orr x8, x8, x15, lsr #63 +; GISEL-NEXT: lsr x10, x16, #63 +; GISEL-NEXT: orr x11, x11, x14, lsr #63 +; GISEL-NEXT: orr x9, x13, x12, lsr #63 +; GISEL-NEXT: stp x8, x10, [x0, #48] +; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = lshr i512 %input_val, 63 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_ashr_i512_const_63(ptr %result, ptr %input) { +; SDAG-LABEL: test_ashr_i512_const_63: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x8, [x1, #48] +; SDAG-NEXT: ldp x11, x10, [x1, #32] +; SDAG-NEXT: ldp x16, x15, [x1, #16] +; SDAG-NEXT: extr x12, x8, x9, #63 +; SDAG-NEXT: asr x8, x8, #63 +; SDAG-NEXT: ldp x13, x14, [x1] +; SDAG-NEXT: extr x9, x9, x10, #63 +; SDAG-NEXT: stp 
x12, x8, [x0, #48] +; SDAG-NEXT: extr x12, x10, x11, #63 +; SDAG-NEXT: extr x8, x15, x16, #63 +; SDAG-NEXT: extr x10, x11, x15, #63 +; SDAG-NEXT: stp x12, x9, [x0, #32] +; SDAG-NEXT: extr x9, x14, x13, #63 +; SDAG-NEXT: stp x8, x10, [x0, #16] +; SDAG-NEXT: extr x8, x16, x14, #63 +; SDAG-NEXT: stp x9, x8, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_ashr_i512_const_63: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x10, [x1] +; GISEL-NEXT: ldp x11, x13, [x1, #24] +; GISEL-NEXT: ldr x17, [x1, #56] +; GISEL-NEXT: lsl x15, x9, #1 +; GISEL-NEXT: lsl x12, x8, #1 +; GISEL-NEXT: lsl x16, x11, #1 +; GISEL-NEXT: orr x8, x15, x8, lsr #63 +; GISEL-NEXT: lsl x15, x13, #1 +; GISEL-NEXT: orr x10, x12, x10, lsr #63 +; GISEL-NEXT: ldp x14, x12, [x1, #40] +; GISEL-NEXT: orr x9, x16, x9, lsr #63 +; GISEL-NEXT: orr x11, x15, x11, lsr #63 +; GISEL-NEXT: stp x10, x8, [x0] +; GISEL-NEXT: lsl x8, x17, #1 +; GISEL-NEXT: lsl x16, x14, #1 +; GISEL-NEXT: lsl x10, x12, #1 +; GISEL-NEXT: stp x9, x11, [x0, #16] +; GISEL-NEXT: asr x9, x17, #63 +; GISEL-NEXT: orr x8, x8, x12, lsr #63 +; GISEL-NEXT: orr x13, x16, x13, lsr #63 +; GISEL-NEXT: orr x10, x10, x14, lsr #63 +; GISEL-NEXT: orr x9, x9, x9, lsl #1 +; GISEL-NEXT: stp x13, x10, [x0, #32] +; GISEL-NEXT: stp x8, x9, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = ashr i512 %input_val, 63 + store i512 %shifted, ptr %result, align 64 + ret void +} + +; Mixed word+bit shifts +define void @test_shl_i512_const_65(ptr %result, ptr %input) { +; SDAG-LABEL: test_shl_i512_const_65: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x8, [x1, #32] +; SDAG-NEXT: ldr x10, [x1, #48] +; SDAG-NEXT: ldp x12, x11, [x1, #16] +; SDAG-NEXT: extr x13, x8, x9, #63 +; SDAG-NEXT: extr x8, x10, x8, #63 +; SDAG-NEXT: ldp x10, x14, [x1] +; SDAG-NEXT: extr x15, x11, x12, #63 +; SDAG-NEXT: stp x13, x8, [x0, #48] +; SDAG-NEXT: extr x9, x9, x11, #63 +; SDAG-NEXT: extr x8, x14, x10, #63 +; 
SDAG-NEXT: extr x11, x12, x14, #63 +; SDAG-NEXT: stp x15, x9, [x0, #32] +; SDAG-NEXT: stp x8, x11, [x0, #16] +; SDAG-NEXT: lsl x8, x10, #1 +; SDAG-NEXT: stp xzr, x8, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_shl_i512_const_65: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldr x15, [x1, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: lsr x14, x8, #63 +; GISEL-NEXT: lsr x16, x9, #63 +; GISEL-NEXT: lsl x8, x8, #1 +; GISEL-NEXT: orr x9, x14, x9, lsl #1 +; GISEL-NEXT: lsr x14, x10, #63 +; GISEL-NEXT: orr x10, x16, x10, lsl #1 +; GISEL-NEXT: stp xzr, x8, [x0] +; GISEL-NEXT: lsr x8, x11, #63 +; GISEL-NEXT: orr x11, x14, x11, lsl #1 +; GISEL-NEXT: lsr x14, x12, #63 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: lsr x9, x13, #63 +; GISEL-NEXT: orr x8, x8, x12, lsl #1 +; GISEL-NEXT: orr x10, x14, x13, lsl #1 +; GISEL-NEXT: orr x9, x9, x15, lsl #1 +; GISEL-NEXT: stp x11, x8, [x0, #32] +; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = shl i512 %input_val, 65 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_lshr_i512_const_65(ptr %result, ptr %input) { +; SDAG-LABEL: test_lshr_i512_const_65: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x8, [x1, #48] +; SDAG-NEXT: ldr x15, [x1, #24] +; SDAG-NEXT: ldp x14, x10, [x1, #32] +; SDAG-NEXT: ldp x11, x12, [x1, #8] +; SDAG-NEXT: lsr x13, x8, #1 +; SDAG-NEXT: extr x8, x8, x9, #1 +; SDAG-NEXT: extr x16, x9, x10, #1 +; SDAG-NEXT: extr x9, x14, x15, #1 +; SDAG-NEXT: extr x10, x10, x14, #1 +; SDAG-NEXT: stp x13, xzr, [x0, #48] +; SDAG-NEXT: stp x16, x8, [x0, #32] +; SDAG-NEXT: extr x8, x12, x11, #1 +; SDAG-NEXT: stp x9, x10, [x0, #16] +; SDAG-NEXT: extr x9, x15, x12, #1 +; SDAG-NEXT: stp x8, x9, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_lshr_i512_const_65: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1, #16] +; GISEL-NEXT: ldr 
x10, [x1, #8] +; GISEL-NEXT: ldp x11, x14, [x1, #32] +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: lsl x12, x8, #63 +; GISEL-NEXT: lsl x13, x9, #63 +; GISEL-NEXT: orr x10, x12, x10, lsr #1 +; GISEL-NEXT: lsl x12, x11, #63 +; GISEL-NEXT: orr x8, x13, x8, lsr #1 +; GISEL-NEXT: lsl x13, x14, #63 +; GISEL-NEXT: orr x9, x12, x9, lsr #1 +; GISEL-NEXT: stp x10, x8, [x0] +; GISEL-NEXT: lsl x10, x15, #63 +; GISEL-NEXT: orr x11, x13, x11, lsr #1 +; GISEL-NEXT: lsl x12, x16, #63 +; GISEL-NEXT: orr x8, x10, x14, lsr #1 +; GISEL-NEXT: lsr x10, x16, #1 +; GISEL-NEXT: stp x9, x11, [x0, #16] +; GISEL-NEXT: orr x9, x12, x15, lsr #1 +; GISEL-NEXT: stp x10, xzr, [x0, #48] +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = lshr i512 %input_val, 65 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_ashr_i512_const_65(ptr %result, ptr %input) { +; SDAG-LABEL: test_ashr_i512_const_65: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x8, [x1, #48] +; SDAG-NEXT: ldr x16, [x1, #24] +; SDAG-NEXT: ldp x15, x10, [x1, #32] +; SDAG-NEXT: ldp x12, x13, [x1, #8] +; SDAG-NEXT: asr x11, x8, #1 +; SDAG-NEXT: asr x14, x8, #63 +; SDAG-NEXT: extr x8, x8, x9, #1 +; SDAG-NEXT: stp x11, x14, [x0, #48] +; SDAG-NEXT: extr x11, x9, x10, #1 +; SDAG-NEXT: extr x9, x15, x16, #1 +; SDAG-NEXT: extr x10, x10, x15, #1 +; SDAG-NEXT: stp x11, x8, [x0, #32] +; SDAG-NEXT: extr x8, x13, x12, #1 +; SDAG-NEXT: stp x9, x10, [x0, #16] +; SDAG-NEXT: extr x9, x16, x13, #1 +; SDAG-NEXT: stp x8, x9, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_ashr_i512_const_65: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1, #16] +; GISEL-NEXT: ldr x11, [x1, #8] +; GISEL-NEXT: ldp x10, x13, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #63 +; GISEL-NEXT: lsl x14, x9, #63 +; GISEL-NEXT: lsl x15, x10, #63 +; GISEL-NEXT: orr x11, x12, x11, lsr #1 +; GISEL-NEXT: ldp x12, x16, [x1, #48] +; GISEL-NEXT: orr x8, x14, x8, lsr #1 +; 
GISEL-NEXT: lsl x14, x13, #63 +; GISEL-NEXT: orr x9, x15, x9, lsr #1 +; GISEL-NEXT: asr x15, x16, #63 +; GISEL-NEXT: stp x11, x8, [x0] +; GISEL-NEXT: lsl x11, x12, #63 +; GISEL-NEXT: orr x10, x14, x10, lsr #1 +; GISEL-NEXT: lsl x14, x16, #63 +; GISEL-NEXT: orr x8, x11, x13, lsr #1 +; GISEL-NEXT: lsl x11, x15, #63 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: orr x9, x14, x12, lsr #1 +; GISEL-NEXT: orr x10, x11, x16, asr #1 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: stp x10, x15, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = ashr i512 %input_val, 65 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_shl_i512_const_100(ptr %result, ptr %input) { +; SDAG-LABEL: test_shl_i512_const_100: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x8, [x1, #32] +; SDAG-NEXT: ldr x10, [x1, #48] +; SDAG-NEXT: ldp x12, x11, [x1, #16] +; SDAG-NEXT: extr x13, x8, x9, #28 +; SDAG-NEXT: extr x8, x10, x8, #28 +; SDAG-NEXT: ldp x10, x14, [x1] +; SDAG-NEXT: extr x15, x11, x12, #28 +; SDAG-NEXT: stp x13, x8, [x0, #48] +; SDAG-NEXT: extr x9, x9, x11, #28 +; SDAG-NEXT: extr x8, x14, x10, #28 +; SDAG-NEXT: extr x11, x12, x14, #28 +; SDAG-NEXT: stp x15, x9, [x0, #32] +; SDAG-NEXT: stp x8, x11, [x0, #16] +; SDAG-NEXT: lsl x8, x10, #36 +; SDAG-NEXT: stp xzr, x8, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_shl_i512_const_100: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldr x15, [x1, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: lsr x14, x8, #28 +; GISEL-NEXT: lsr x16, x9, #28 +; GISEL-NEXT: lsl x8, x8, #36 +; GISEL-NEXT: orr x9, x14, x9, lsl #36 +; GISEL-NEXT: lsr x14, x10, #28 +; GISEL-NEXT: orr x10, x16, x10, lsl #36 +; GISEL-NEXT: stp xzr, x8, [x0] +; GISEL-NEXT: lsr x8, x11, #28 +; GISEL-NEXT: orr x11, x14, x11, lsl #36 +; GISEL-NEXT: lsr x14, x12, #28 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: lsr x9, x13, 
#28 +; GISEL-NEXT: orr x8, x8, x12, lsl #36 +; GISEL-NEXT: orr x10, x14, x13, lsl #36 +; GISEL-NEXT: orr x9, x9, x15, lsl #36 +; GISEL-NEXT: stp x11, x8, [x0, #32] +; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = shl i512 %input_val, 100 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_lshr_i512_const_100(ptr %result, ptr %input) { +; SDAG-LABEL: test_lshr_i512_const_100: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x8, [x1, #48] +; SDAG-NEXT: ldr x15, [x1, #24] +; SDAG-NEXT: ldp x14, x10, [x1, #32] +; SDAG-NEXT: ldp x11, x12, [x1, #8] +; SDAG-NEXT: lsr x13, x8, #36 +; SDAG-NEXT: extr x8, x8, x9, #36 +; SDAG-NEXT: extr x16, x9, x10, #36 +; SDAG-NEXT: extr x9, x14, x15, #36 +; SDAG-NEXT: extr x10, x10, x14, #36 +; SDAG-NEXT: stp x13, xzr, [x0, #48] +; SDAG-NEXT: stp x16, x8, [x0, #32] +; SDAG-NEXT: extr x8, x12, x11, #36 +; SDAG-NEXT: stp x9, x10, [x0, #16] +; SDAG-NEXT: extr x9, x15, x12, #36 +; SDAG-NEXT: stp x8, x9, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_lshr_i512_const_100: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1, #16] +; GISEL-NEXT: ldr x10, [x1, #8] +; GISEL-NEXT: ldp x11, x14, [x1, #32] +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: lsl x12, x8, #28 +; GISEL-NEXT: lsl x13, x9, #28 +; GISEL-NEXT: orr x10, x12, x10, lsr #36 +; GISEL-NEXT: lsl x12, x11, #28 +; GISEL-NEXT: orr x8, x13, x8, lsr #36 +; GISEL-NEXT: lsl x13, x14, #28 +; GISEL-NEXT: orr x9, x12, x9, lsr #36 +; GISEL-NEXT: stp x10, x8, [x0] +; GISEL-NEXT: lsl x10, x15, #28 +; GISEL-NEXT: orr x11, x13, x11, lsr #36 +; GISEL-NEXT: lsl x12, x16, #28 +; GISEL-NEXT: orr x8, x10, x14, lsr #36 +; GISEL-NEXT: lsr x10, x16, #36 +; GISEL-NEXT: stp x9, x11, [x0, #16] +; GISEL-NEXT: orr x9, x12, x15, lsr #36 +; GISEL-NEXT: stp x10, xzr, [x0, #48] +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = 
lshr i512 %input_val, 100 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_ashr_i512_const_100(ptr %result, ptr %input) { +; SDAG-LABEL: test_ashr_i512_const_100: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x8, [x1, #48] +; SDAG-NEXT: ldr x16, [x1, #24] +; SDAG-NEXT: ldp x15, x10, [x1, #32] +; SDAG-NEXT: ldp x12, x13, [x1, #8] +; SDAG-NEXT: asr x11, x8, #36 +; SDAG-NEXT: asr x14, x8, #63 +; SDAG-NEXT: extr x8, x8, x9, #36 +; SDAG-NEXT: stp x11, x14, [x0, #48] +; SDAG-NEXT: extr x11, x9, x10, #36 +; SDAG-NEXT: extr x9, x15, x16, #36 +; SDAG-NEXT: extr x10, x10, x15, #36 +; SDAG-NEXT: stp x11, x8, [x0, #32] +; SDAG-NEXT: extr x8, x13, x12, #36 +; SDAG-NEXT: stp x9, x10, [x0, #16] +; SDAG-NEXT: extr x9, x16, x13, #36 +; SDAG-NEXT: stp x8, x9, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_ashr_i512_const_100: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1, #16] +; GISEL-NEXT: ldr x11, [x1, #8] +; GISEL-NEXT: ldp x10, x13, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #28 +; GISEL-NEXT: lsl x14, x9, #28 +; GISEL-NEXT: lsl x15, x10, #28 +; GISEL-NEXT: orr x11, x12, x11, lsr #36 +; GISEL-NEXT: ldp x12, x16, [x1, #48] +; GISEL-NEXT: orr x8, x14, x8, lsr #36 +; GISEL-NEXT: lsl x14, x13, #28 +; GISEL-NEXT: orr x9, x15, x9, lsr #36 +; GISEL-NEXT: asr x15, x16, #63 +; GISEL-NEXT: stp x11, x8, [x0] +; GISEL-NEXT: lsl x11, x12, #28 +; GISEL-NEXT: orr x10, x14, x10, lsr #36 +; GISEL-NEXT: lsl x14, x16, #28 +; GISEL-NEXT: orr x8, x11, x13, lsr #36 +; GISEL-NEXT: lsl x11, x15, #28 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: orr x9, x14, x12, lsr #36 +; GISEL-NEXT: orr x10, x11, x16, asr #36 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: stp x10, x15, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = ashr i512 %input_val, 100 + store i512 %shifted, ptr %result, align 64 + ret void +} + +; Boundary conditions - test exactly at the edge +define void @test_shl_i512_const_127(ptr 
%result, ptr %input) { +; SDAG-LABEL: test_shl_i512_const_127: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x8, [x1, #32] +; SDAG-NEXT: ldr x10, [x1, #48] +; SDAG-NEXT: ldp x12, x11, [x1, #16] +; SDAG-NEXT: extr x13, x8, x9, #1 +; SDAG-NEXT: extr x8, x10, x8, #1 +; SDAG-NEXT: ldp x10, x14, [x1] +; SDAG-NEXT: extr x15, x11, x12, #1 +; SDAG-NEXT: stp x13, x8, [x0, #48] +; SDAG-NEXT: extr x9, x9, x11, #1 +; SDAG-NEXT: extr x8, x14, x10, #1 +; SDAG-NEXT: extr x11, x12, x14, #1 +; SDAG-NEXT: stp x15, x9, [x0, #32] +; SDAG-NEXT: stp x8, x11, [x0, #16] +; SDAG-NEXT: lsl x8, x10, #63 +; SDAG-NEXT: stp xzr, x8, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_shl_i512_const_127: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldr x15, [x1, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: lsr x14, x8, #1 +; GISEL-NEXT: lsr x16, x9, #1 +; GISEL-NEXT: lsl x8, x8, #63 +; GISEL-NEXT: orr x9, x14, x9, lsl #63 +; GISEL-NEXT: lsr x14, x10, #1 +; GISEL-NEXT: orr x10, x16, x10, lsl #63 +; GISEL-NEXT: stp xzr, x8, [x0] +; GISEL-NEXT: lsr x8, x11, #1 +; GISEL-NEXT: orr x11, x14, x11, lsl #63 +; GISEL-NEXT: lsr x14, x12, #1 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: lsr x9, x13, #1 +; GISEL-NEXT: orr x8, x8, x12, lsl #63 +; GISEL-NEXT: orr x10, x14, x13, lsl #63 +; GISEL-NEXT: orr x9, x9, x15, lsl #63 +; GISEL-NEXT: stp x11, x8, [x0, #32] +; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = shl i512 %input_val, 127 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_lshr_i512_const_127(ptr %result, ptr %input) { +; SDAG-LABEL: test_lshr_i512_const_127: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x8, [x1, #48] +; SDAG-NEXT: ldr x15, [x1, #24] +; SDAG-NEXT: ldp x14, x10, [x1, #32] +; SDAG-NEXT: ldp x11, x12, [x1, #8] +; SDAG-NEXT: lsr x13, x8, #63 +; SDAG-NEXT: extr x8, x8, x9, #63 +; SDAG-NEXT: 
extr x16, x9, x10, #63 +; SDAG-NEXT: extr x9, x14, x15, #63 +; SDAG-NEXT: extr x10, x10, x14, #63 +; SDAG-NEXT: stp x13, xzr, [x0, #48] +; SDAG-NEXT: stp x16, x8, [x0, #32] +; SDAG-NEXT: extr x8, x12, x11, #63 +; SDAG-NEXT: stp x9, x10, [x0, #16] +; SDAG-NEXT: extr x9, x15, x12, #63 +; SDAG-NEXT: stp x8, x9, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_lshr_i512_const_127: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1, #16] +; GISEL-NEXT: ldr x10, [x1, #8] +; GISEL-NEXT: ldp x11, x14, [x1, #32] +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: lsl x12, x8, #1 +; GISEL-NEXT: lsl x13, x9, #1 +; GISEL-NEXT: orr x10, x12, x10, lsr #63 +; GISEL-NEXT: lsl x12, x11, #1 +; GISEL-NEXT: orr x8, x13, x8, lsr #63 +; GISEL-NEXT: lsl x13, x14, #1 +; GISEL-NEXT: orr x9, x12, x9, lsr #63 +; GISEL-NEXT: stp x10, x8, [x0] +; GISEL-NEXT: lsl x10, x15, #1 +; GISEL-NEXT: orr x11, x13, x11, lsr #63 +; GISEL-NEXT: lsl x12, x16, #1 +; GISEL-NEXT: orr x8, x10, x14, lsr #63 +; GISEL-NEXT: lsr x10, x16, #63 +; GISEL-NEXT: stp x9, x11, [x0, #16] +; GISEL-NEXT: orr x9, x12, x15, lsr #63 +; GISEL-NEXT: stp x10, xzr, [x0, #48] +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = lshr i512 %input_val, 127 + store i512 %shifted, ptr %result, align 64 + ret void +} + +define void @test_ashr_i512_const_127(ptr %result, ptr %input) { +; SDAG-LABEL: test_ashr_i512_const_127: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: ldp x9, x8, [x1, #48] +; SDAG-NEXT: ldr x15, [x1, #24] +; SDAG-NEXT: ldp x14, x10, [x1, #32] +; SDAG-NEXT: ldp x12, x13, [x1, #8] +; SDAG-NEXT: asr x11, x8, #63 +; SDAG-NEXT: extr x8, x8, x9, #63 +; SDAG-NEXT: extr x16, x9, x10, #63 +; SDAG-NEXT: extr x9, x14, x15, #63 +; SDAG-NEXT: extr x10, x10, x14, #63 +; SDAG-NEXT: stp x11, x11, [x0, #48] +; SDAG-NEXT: stp x16, x8, [x0, #32] +; SDAG-NEXT: extr x8, x13, x12, #63 +; SDAG-NEXT: stp x9, x10, [x0, #16] +; SDAG-NEXT: extr x9, x15, x13, #63 +; SDAG-NEXT: 
stp x8, x9, [x0] +; SDAG-NEXT: ret +; +; GISEL-LABEL: test_ashr_i512_const_127: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: ldp x8, x9, [x1, #16] +; GISEL-NEXT: ldr x10, [x1, #8] +; GISEL-NEXT: ldp x11, x14, [x1, #32] +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: lsl x12, x8, #1 +; GISEL-NEXT: lsl x13, x9, #1 +; GISEL-NEXT: orr x10, x12, x10, lsr #63 +; GISEL-NEXT: lsl x12, x11, #1 +; GISEL-NEXT: orr x8, x13, x8, lsr #63 +; GISEL-NEXT: lsl x13, x14, #1 +; GISEL-NEXT: orr x9, x12, x9, lsr #63 +; GISEL-NEXT: lsl x12, x15, #1 +; GISEL-NEXT: stp x10, x8, [x0] +; GISEL-NEXT: lsl x10, x16, #1 +; GISEL-NEXT: orr x11, x13, x11, lsr #63 +; GISEL-NEXT: asr x8, x16, #63 +; GISEL-NEXT: orr x12, x12, x14, lsr #63 +; GISEL-NEXT: stp x9, x11, [x0, #16] +; GISEL-NEXT: orr x9, x10, x15, lsr #63 +; GISEL-NEXT: orr x10, x8, x8, lsl #1 +; GISEL-NEXT: stp x12, x9, [x0, #32] +; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ret +entry: + %input_val = load i512, ptr %input, align 64 + %shifted = ashr i512 %input_val, 127 + store i512 %shifted, ptr %result, align 64 + ret void +} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/README b/llvm/test/CodeGen/AArch64/README deleted file mode 100644 index b0a93e8668ee..000000000000 --- a/llvm/test/CodeGen/AArch64/README +++ /dev/null @@ -1,11 +0,0 @@ -++ SVE CodeGen Warnings ++ - -When the WARN check lines fail in the SVE codegen tests it most likely means you -have introduced a warning due to: -1. Adding an invalid call to VectorType::getNumElements() or EVT::getVectorNumElements() - when the type is a scalable vector. -2. Relying upon an implicit cast conversion from TypeSize to uint64_t. - -For generic code, please modify your code to work with ElementCount and TypeSize directly. -For target-specific code that only deals with fixed-width vectors, use the fixed-size interfaces. 
-Please refer to the code where those functions live for more details. diff --git a/llvm/test/CodeGen/AArch64/alias_mask.ll b/llvm/test/CodeGen/AArch64/alias_mask.ll new file mode 100644 index 000000000000..9b9c020016ba --- /dev/null +++ b/llvm/test/CodeGen/AArch64/alias_mask.ll @@ -0,0 +1,900 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s + +define <16 x i1> @whilewr_8(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.b, x0, x1 +; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret +entry: + %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1) + ret <16 x i1> %0 +} + +define <8 x i1> @whilewr_16(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.h, x0, x1 +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: ret +entry: + %0 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2) + ret <8 x i1> %0 +} + +define <4 x i1> @whilewr_32(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.s, x0, x1 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret +entry: + %0 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4) + ret <4 x i1> %0 +} + +define <2 x i1> @whilewr_64(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.d, x0, x1 +; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ret +entry: + %0 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8) + ret <2 x i1> %0 +} + +define <16 x i1> @whilerw_8(ptr %a, ptr %b) { +; CHECK-LABEL: 
whilerw_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.b, x0, x1 +; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret +entry: + %0 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1) + ret <16 x i1> %0 +} + +define <8 x i1> @whilerw_16(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.h, x0, x1 +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: ret +entry: + %0 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2) + ret <8 x i1> %0 +} + +define <4 x i1> @whilerw_32(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.s, x0, x1 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret +entry: + %0 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4) + ret <4 x i1> %0 +} + +define <2 x i1> @whilerw_64(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.d, x0, x1 +; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ret +entry: + %0 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8) + ret <2 x i1> %0 +} + +define <32 x i1> @whilewr_8_split(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8_split: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add x9, x0, #16 +; CHECK-NEXT: whilewr p0.b, x0, x1 +; CHECK-NEXT: whilewr p1.b, x9, x1 +; CHECK-NEXT: adrp x9, .LCPI8_0 +; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_0] +; CHECK-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: and v0.16b, 
v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: zip1 v0.16b, v0.16b, v2.16b +; CHECK-NEXT: zip1 v1.16b, v1.16b, v3.16b +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: str h1, [x8, #2] +; CHECK-NEXT: ret +entry: + %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 1) + ret <32 x i1> %0 +} + +define <64 x i1> @whilewr_8_split2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8_split2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add x9, x0, #48 +; CHECK-NEXT: whilewr p0.b, x0, x1 +; CHECK-NEXT: add x10, x0, #16 +; CHECK-NEXT: whilewr p1.b, x9, x1 +; CHECK-NEXT: add x9, x0, #32 +; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p0.b, x9, x1 +; CHECK-NEXT: adrp x9, .LCPI9_0 +; CHECK-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p1.b, x10, x1 +; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI9_0] +; CHECK-NEXT: mov z2.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z3.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: shl v2.16b, v2.16b, #7 +; CHECK-NEXT: shl v3.16b, v3.16b, #7 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmlt v2.16b, v2.16b, #0 +; CHECK-NEXT: cmlt v3.16b, v3.16b, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: and v2.16b, v2.16b, v4.16b +; CHECK-NEXT: and v3.16b, v3.16b, v4.16b +; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ext v6.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: ext v7.16b, v3.16b, v3.16b, #8 +; CHECK-NEXT: zip1 v0.16b, v0.16b, v4.16b +; CHECK-NEXT: zip1 v1.16b, v1.16b, v5.16b +; CHECK-NEXT: zip1 v2.16b, v2.16b, v6.16b +; CHECK-NEXT: zip1 v3.16b, v3.16b, v7.16b +; 
CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: addv h2, v2.8h +; CHECK-NEXT: addv h3, v3.8h +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: str h1, [x8, #6] +; CHECK-NEXT: str h2, [x8, #4] +; CHECK-NEXT: str h3, [x8, #2] +; CHECK-NEXT: ret +entry: + %0 = call <64 x i1> @llvm.loop.dependence.war.mask.v64i1(ptr %a, ptr %b, i64 1) + ret <64 x i1> %0 +} + +define <16 x i1> @whilewr_16_expand(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16_expand: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: sub x8, x1, x0 +; CHECK-NEXT: add x8, x8, x8, lsr #63 +; CHECK-NEXT: asr x8, x8, #1 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z6.d, z0.d +; CHECK-NEXT: mov z7.d, z0.d +; CHECK-NEXT: mov z16.d, z0.d +; CHECK-NEXT: dup v3.2d, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc +; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa +; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8 +; CHECK-NEXT: add z5.d, z5.d, #6 // =0x6 +; CHECK-NEXT: add z6.d, z6.d, #4 // =0x4 +; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2 +; CHECK-NEXT: add z16.d, z16.d, #14 // =0xe +; CHECK-NEXT: cmhi v0.2d, v3.2d, v0.2d +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: cmhi v1.2d, v3.2d, v1.2d +; CHECK-NEXT: cmhi v2.2d, v3.2d, v2.2d +; CHECK-NEXT: cmhi v4.2d, v3.2d, v4.2d +; CHECK-NEXT: cmhi v5.2d, v3.2d, v5.2d +; CHECK-NEXT: cmhi v6.2d, v3.2d, v6.2d +; CHECK-NEXT: cmhi v16.2d, v3.2d, v16.2d +; CHECK-NEXT: cmhi v3.2d, v3.2d, v7.2d +; CHECK-NEXT: uzp1 v2.4s, v4.4s, v2.4s +; CHECK-NEXT: uzp1 v4.4s, v6.4s, v5.4s +; CHECK-NEXT: uzp1 v1.4s, v1.4s, v16.4s +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v3.4s +; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v4.8h +; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.16b, w8 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, 
i64 2) + ret <16 x i1> %0 +} + +define <32 x i1> @whilewr_16_expand2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16_expand2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub x9, x1, x0 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: sub x10, x9, #32 +; CHECK-NEXT: add x9, x9, x9, lsr #63 +; CHECK-NEXT: add x10, x10, x10, lsr #63 +; CHECK-NEXT: asr x9, x9, #1 +; CHECK-NEXT: asr x10, x10, #1 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z6.d, z0.d +; CHECK-NEXT: dup v7.2d, x9 +; CHECK-NEXT: dup v16.2d, x10 +; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc +; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa +; CHECK-NEXT: cmp x10, #1 +; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8 +; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6 +; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4 +; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2 +; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d +; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d +; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe +; CHECK-NEXT: cmhi v19.2d, v7.2d, v1.2d +; CHECK-NEXT: cmhi v20.2d, v7.2d, v2.2d +; CHECK-NEXT: cmhi v21.2d, v7.2d, v3.2d +; CHECK-NEXT: cmhi v22.2d, v7.2d, v4.2d +; CHECK-NEXT: cmhi v23.2d, v7.2d, v5.2d +; CHECK-NEXT: cmhi v24.2d, v7.2d, v6.2d +; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d +; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d +; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d +; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d +; CHECK-NEXT: cmhi v7.2d, v7.2d, v0.2d +; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d +; CHECK-NEXT: cmhi v6.2d, v16.2d, v6.2d +; CHECK-NEXT: cset w10, lt +; CHECK-NEXT: cmhi v0.2d, v16.2d, v0.2d +; CHECK-NEXT: uzp1 v16.4s, v21.4s, v20.4s +; CHECK-NEXT: cmp x9, #1 +; CHECK-NEXT: uzp1 v20.4s, v23.4s, v22.4s +; CHECK-NEXT: uzp1 v17.4s, v17.4s, v24.4s +; CHECK-NEXT: cset w9, lt +; CHECK-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; CHECK-NEXT: uzp1 v3.4s, v19.4s, v7.4s +; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s +; CHECK-NEXT: uzp1 v5.4s, v18.4s, v6.4s +; CHECK-NEXT: 
uzp1 v0.4s, v1.4s, v0.4s +; CHECK-NEXT: uzp1 v1.8h, v17.8h, v20.8h +; CHECK-NEXT: uzp1 v3.8h, v16.8h, v3.8h +; CHECK-NEXT: uzp1 v4.8h, v5.8h, v4.8h +; CHECK-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-NEXT: dup v2.16b, w9 +; CHECK-NEXT: adrp x9, .LCPI11_0 +; CHECK-NEXT: uzp1 v1.16b, v1.16b, v3.16b +; CHECK-NEXT: dup v3.16b, w10 +; CHECK-NEXT: uzp1 v0.16b, v4.16b, v0.16b +; CHECK-NEXT: orr v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI11_0] +; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v1.16b, v1.16b, v2.16b +; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b +; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: str h1, [x8] +; CHECK-NEXT: str h0, [x8, #2] +; CHECK-NEXT: ret +entry: + %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 2) + ret <32 x i1> %0 +} + +define <8 x i1> @whilewr_32_expand(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_expand: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: add x9, x8, #3 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #2 +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6 +; CHECK-NEXT: add z2.d, z2.d, #4 // =0x4 +; CHECK-NEXT: add z3.d, z3.d, #2 // =0x2 +; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d +; CHECK-NEXT: cmhi v4.2d, v1.2d, v4.2d +; CHECK-NEXT: cmhi v2.2d, v1.2d, v2.2d +; CHECK-NEXT: cmhi v1.2d, v1.2d, v3.2d +; CHECK-NEXT: uzp1 v2.4s, v2.4s, v4.4s +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: 
dup v1.8b, w8 +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret +entry: + %0 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4) + ret <8 x i1> %0 +} + +define <16 x i1> @whilewr_32_expand2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_expand2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: add x9, x8, #3 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #2 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z6.d, z0.d +; CHECK-NEXT: mov z7.d, z0.d +; CHECK-NEXT: mov z16.d, z0.d +; CHECK-NEXT: dup v3.2d, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc +; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa +; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8 +; CHECK-NEXT: add z5.d, z5.d, #6 // =0x6 +; CHECK-NEXT: add z6.d, z6.d, #4 // =0x4 +; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2 +; CHECK-NEXT: add z16.d, z16.d, #14 // =0xe +; CHECK-NEXT: cmhi v0.2d, v3.2d, v0.2d +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: cmhi v1.2d, v3.2d, v1.2d +; CHECK-NEXT: cmhi v2.2d, v3.2d, v2.2d +; CHECK-NEXT: cmhi v4.2d, v3.2d, v4.2d +; CHECK-NEXT: cmhi v5.2d, v3.2d, v5.2d +; CHECK-NEXT: cmhi v6.2d, v3.2d, v6.2d +; CHECK-NEXT: cmhi v16.2d, v3.2d, v16.2d +; CHECK-NEXT: cmhi v3.2d, v3.2d, v7.2d +; CHECK-NEXT: uzp1 v2.4s, v4.4s, v2.4s +; CHECK-NEXT: uzp1 v4.4s, v6.4s, v5.4s +; CHECK-NEXT: uzp1 v1.4s, v1.4s, v16.4s +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v3.4s +; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v4.8h +; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.16b, w8 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 4) + ret <16 x i1> %0 +} + +define <32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) { +; 
CHECK-LABEL: whilewr_32_expand3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x9, x1, x0 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: add x10, x9, #3 +; CHECK-NEXT: sub x11, x9, #61 +; CHECK-NEXT: csel x10, x10, x9, mi +; CHECK-NEXT: subs x9, x9, #64 +; CHECK-NEXT: csel x9, x11, x9, mi +; CHECK-NEXT: asr x10, x10, #2 +; CHECK-NEXT: asr x9, x9, #2 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z6.d, z0.d +; CHECK-NEXT: dup v7.2d, x10 +; CHECK-NEXT: dup v16.2d, x9 +; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc +; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa +; CHECK-NEXT: cmp x9, #1 +; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8 +; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6 +; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4 +; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2 +; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d +; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d +; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe +; CHECK-NEXT: cmhi v19.2d, v7.2d, v1.2d +; CHECK-NEXT: cmhi v20.2d, v7.2d, v2.2d +; CHECK-NEXT: cmhi v21.2d, v7.2d, v3.2d +; CHECK-NEXT: cmhi v22.2d, v7.2d, v4.2d +; CHECK-NEXT: cmhi v23.2d, v7.2d, v5.2d +; CHECK-NEXT: cmhi v24.2d, v7.2d, v6.2d +; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d +; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d +; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d +; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d +; CHECK-NEXT: cmhi v7.2d, v7.2d, v0.2d +; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d +; CHECK-NEXT: cmhi v6.2d, v16.2d, v6.2d +; CHECK-NEXT: cset w9, lt +; CHECK-NEXT: cmhi v0.2d, v16.2d, v0.2d +; CHECK-NEXT: uzp1 v16.4s, v21.4s, v20.4s +; CHECK-NEXT: cmp x10, #1 +; CHECK-NEXT: uzp1 v20.4s, v23.4s, v22.4s +; CHECK-NEXT: uzp1 v17.4s, v17.4s, v24.4s +; CHECK-NEXT: cset w10, lt +; CHECK-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; CHECK-NEXT: uzp1 v3.4s, v19.4s, v7.4s +; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s +; CHECK-NEXT: uzp1 v5.4s, v18.4s, v6.4s +; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; 
CHECK-NEXT: uzp1 v1.8h, v17.8h, v20.8h +; CHECK-NEXT: uzp1 v3.8h, v16.8h, v3.8h +; CHECK-NEXT: uzp1 v4.8h, v5.8h, v4.8h +; CHECK-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-NEXT: dup v2.16b, w10 +; CHECK-NEXT: uzp1 v1.16b, v1.16b, v3.16b +; CHECK-NEXT: dup v3.16b, w9 +; CHECK-NEXT: adrp x9, .LCPI14_0 +; CHECK-NEXT: uzp1 v0.16b, v4.16b, v0.16b +; CHECK-NEXT: orr v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_0] +; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v1.16b, v1.16b, v2.16b +; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b +; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: str h1, [x8] +; CHECK-NEXT: str h0, [x8, #2] +; CHECK-NEXT: ret +entry: + %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 4) + ret <32 x i1> %0 +} + +define <4 x i1> @whilewr_64_expand(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_expand: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: add x9, x8, #7 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #3 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: dup v2.2d, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: add z1.d, z1.d, #2 // =0x2 +; CHECK-NEXT: cmhi v0.2d, v2.2d, v0.2d +; CHECK-NEXT: cmhi v1.2d, v2.2d, v1.2d +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: dup v1.4h, w8 +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret +entry: + %0 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 8) + ret <4 x i1> %0 +} + +define <8 x i1> @whilewr_64_expand2(ptr %a, ptr %b) { +; CHECK-LABEL: 
whilewr_64_expand2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: add x9, x8, #7 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #3 +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6 +; CHECK-NEXT: add z2.d, z2.d, #4 // =0x4 +; CHECK-NEXT: add z3.d, z3.d, #2 // =0x2 +; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d +; CHECK-NEXT: cmhi v4.2d, v1.2d, v4.2d +; CHECK-NEXT: cmhi v2.2d, v1.2d, v2.2d +; CHECK-NEXT: cmhi v1.2d, v1.2d, v3.2d +; CHECK-NEXT: uzp1 v2.4s, v2.4s, v4.4s +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: dup v1.8b, w8 +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret +entry: + %0 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 8) + ret <8 x i1> %0 +} + +define <16 x i1> @whilewr_64_expand3(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_expand3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: add x9, x8, #7 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #3 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z6.d, z0.d +; CHECK-NEXT: mov z7.d, z0.d +; CHECK-NEXT: mov z16.d, z0.d +; CHECK-NEXT: dup v3.2d, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc +; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa +; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8 +; CHECK-NEXT: add z5.d, z5.d, #6 // =0x6 +; CHECK-NEXT: add z6.d, z6.d, #4 // =0x4 +; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2 +; CHECK-NEXT: add z16.d, z16.d, #14 // =0xe +; CHECK-NEXT: cmhi v0.2d, v3.2d, v0.2d +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: cmhi v1.2d, v3.2d, v1.2d +; CHECK-NEXT: cmhi v2.2d, 
v3.2d, v2.2d +; CHECK-NEXT: cmhi v4.2d, v3.2d, v4.2d +; CHECK-NEXT: cmhi v5.2d, v3.2d, v5.2d +; CHECK-NEXT: cmhi v6.2d, v3.2d, v6.2d +; CHECK-NEXT: cmhi v16.2d, v3.2d, v16.2d +; CHECK-NEXT: cmhi v3.2d, v3.2d, v7.2d +; CHECK-NEXT: uzp1 v2.4s, v4.4s, v2.4s +; CHECK-NEXT: uzp1 v4.4s, v6.4s, v5.4s +; CHECK-NEXT: uzp1 v1.4s, v1.4s, v16.4s +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v3.4s +; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v4.8h +; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.16b, w8 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8) + ret <16 x i1> %0 +} + +define <32 x i1> @whilewr_64_expand4(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_expand4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x9, x1, x0 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: add x10, x9, #7 +; CHECK-NEXT: sub x11, x9, #121 +; CHECK-NEXT: csel x10, x10, x9, mi +; CHECK-NEXT: subs x9, x9, #128 +; CHECK-NEXT: csel x9, x11, x9, mi +; CHECK-NEXT: asr x10, x10, #3 +; CHECK-NEXT: asr x9, x9, #3 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z6.d, z0.d +; CHECK-NEXT: dup v7.2d, x10 +; CHECK-NEXT: dup v16.2d, x9 +; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc +; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa +; CHECK-NEXT: cmp x9, #1 +; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8 +; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6 +; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4 +; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2 +; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d +; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d +; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe +; CHECK-NEXT: cmhi v19.2d, v7.2d, v1.2d +; CHECK-NEXT: cmhi v20.2d, v7.2d, v2.2d +; CHECK-NEXT: cmhi v21.2d, v7.2d, v3.2d +; CHECK-NEXT: cmhi v22.2d, v7.2d, v4.2d +; CHECK-NEXT: cmhi v23.2d, v7.2d, v5.2d +; CHECK-NEXT: cmhi 
v24.2d, v7.2d, v6.2d +; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d +; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d +; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d +; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d +; CHECK-NEXT: cmhi v7.2d, v7.2d, v0.2d +; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d +; CHECK-NEXT: cmhi v6.2d, v16.2d, v6.2d +; CHECK-NEXT: cset w9, lt +; CHECK-NEXT: cmhi v0.2d, v16.2d, v0.2d +; CHECK-NEXT: uzp1 v16.4s, v21.4s, v20.4s +; CHECK-NEXT: cmp x10, #1 +; CHECK-NEXT: uzp1 v20.4s, v23.4s, v22.4s +; CHECK-NEXT: uzp1 v17.4s, v17.4s, v24.4s +; CHECK-NEXT: cset w10, lt +; CHECK-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; CHECK-NEXT: uzp1 v3.4s, v19.4s, v7.4s +; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s +; CHECK-NEXT: uzp1 v5.4s, v18.4s, v6.4s +; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; CHECK-NEXT: uzp1 v1.8h, v17.8h, v20.8h +; CHECK-NEXT: uzp1 v3.8h, v16.8h, v3.8h +; CHECK-NEXT: uzp1 v4.8h, v5.8h, v4.8h +; CHECK-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-NEXT: dup v2.16b, w10 +; CHECK-NEXT: uzp1 v1.16b, v1.16b, v3.16b +; CHECK-NEXT: dup v3.16b, w9 +; CHECK-NEXT: adrp x9, .LCPI18_0 +; CHECK-NEXT: uzp1 v0.16b, v4.16b, v0.16b +; CHECK-NEXT: orr v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI18_0] +; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v1.16b, v1.16b, v2.16b +; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b +; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: str h1, [x8] +; CHECK-NEXT: str h0, [x8, #2] +; CHECK-NEXT: ret +entry: + %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 8) + ret <32 x i1> %0 +} + +define <9 x i1> @whilewr_8_widen(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8_widen: +; CHECK: 
// %bb.0: // %entry +; CHECK-NEXT: whilewr p0.b, x0, x1 +; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: umov w9, v0.b[0] +; CHECK-NEXT: umov w10, v0.b[1] +; CHECK-NEXT: umov w11, v0.b[2] +; CHECK-NEXT: umov w12, v0.b[7] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: bfi w9, w10, #1, #1 +; CHECK-NEXT: umov w10, v0.b[3] +; CHECK-NEXT: bfi w9, w11, #2, #1 +; CHECK-NEXT: umov w11, v0.b[4] +; CHECK-NEXT: bfi w9, w10, #3, #1 +; CHECK-NEXT: umov w10, v0.b[5] +; CHECK-NEXT: bfi w9, w11, #4, #1 +; CHECK-NEXT: umov w11, v0.b[6] +; CHECK-NEXT: bfi w9, w10, #5, #1 +; CHECK-NEXT: umov w10, v0.b[8] +; CHECK-NEXT: bfi w9, w11, #6, #1 +; CHECK-NEXT: ubfiz w11, w12, #7, #1 +; CHECK-NEXT: orr w9, w9, w11 +; CHECK-NEXT: orr w9, w9, w10, lsl #8 +; CHECK-NEXT: and w9, w9, #0x1ff +; CHECK-NEXT: strh w9, [x8] +; CHECK-NEXT: ret +entry: + %0 = call <9 x i1> @llvm.loop.dependence.war.mask.v9i1(ptr %a, ptr %b, i64 1) + ret <9 x i1> %0 +} + +define <7 x i1> @whilewr_16_widen(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16_widen: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.h, x0, x1 +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: umov w1, v0.b[1] +; CHECK-NEXT: umov w2, v0.b[2] +; CHECK-NEXT: umov w3, v0.b[3] +; CHECK-NEXT: umov w4, v0.b[4] +; CHECK-NEXT: umov w5, v0.b[5] +; CHECK-NEXT: umov w6, v0.b[6] +; CHECK-NEXT: ret +entry: + %0 = call <7 x i1> @llvm.loop.dependence.war.mask.v7i1(ptr %a, ptr %b, i64 2) + ret <7 x i1> %0 +} + +define <3 x i1> @whilewr_32_widen(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_widen: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.s, x0, x1 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: umov w1, v0.h[1] +; CHECK-NEXT: umov w2, v0.h[2] +; CHECK-NEXT: ret +entry: + %0 = call <3 x i1> @llvm.loop.dependence.war.mask.v3i1(ptr %a, ptr %b, i64 4) + 
ret <3 x i1> %0 +} + +define <16 x i1> @whilewr_badimm(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_badimm: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555 +; CHECK-NEXT: sub x9, x1, x0 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: movk x8, #21846 +; CHECK-NEXT: smulh x8, x9, x8 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z6.d, z0.d +; CHECK-NEXT: mov z7.d, z0.d +; CHECK-NEXT: mov z16.d, z0.d +; CHECK-NEXT: add x8, x8, x8, lsr #63 +; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc +; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa +; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8 +; CHECK-NEXT: add z5.d, z5.d, #6 // =0x6 +; CHECK-NEXT: add z6.d, z6.d, #4 // =0x4 +; CHECK-NEXT: dup v3.2d, x8 +; CHECK-NEXT: add z16.d, z16.d, #14 // =0xe +; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: cmhi v0.2d, v3.2d, v0.2d +; CHECK-NEXT: cmhi v1.2d, v3.2d, v1.2d +; CHECK-NEXT: cmhi v2.2d, v3.2d, v2.2d +; CHECK-NEXT: cmhi v4.2d, v3.2d, v4.2d +; CHECK-NEXT: cmhi v16.2d, v3.2d, v16.2d +; CHECK-NEXT: cmhi v5.2d, v3.2d, v5.2d +; CHECK-NEXT: cmhi v6.2d, v3.2d, v6.2d +; CHECK-NEXT: cmhi v3.2d, v3.2d, v7.2d +; CHECK-NEXT: uzp1 v1.4s, v1.4s, v16.4s +; CHECK-NEXT: uzp1 v2.4s, v4.4s, v2.4s +; CHECK-NEXT: uzp1 v4.4s, v6.4s, v5.4s +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v3.4s +; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v4.8h +; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.16b, w8 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 3) + ret <16 x i1> %0 +} + +; Scalarizing <1 x i1> types + +define <1 x i1> @whilewr_8_scalarize(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8_scalarize: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: 
cset w8, gt +; CHECK-NEXT: cmp x1, x0 +; CHECK-NEXT: csinc w0, w8, wzr, ne +; CHECK-NEXT: ret +entry: + %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 1) + ret <1 x i1> %0 +} + +define <1 x i1> @whilewr_16_scalarize(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16_scalarize: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: cmp x1, x0 +; CHECK-NEXT: csinc w0, w8, wzr, ne +; CHECK-NEXT: ret +entry: + %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 2) + ret <1 x i1> %0 +} + +define <1 x i1> @whilewr_32_scalarize(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_scalarize: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: cmp x8, #3 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: cmp x1, x0 +; CHECK-NEXT: csinc w0, w8, wzr, ne +; CHECK-NEXT: ret +entry: + %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 4) + ret <1 x i1> %0 +} + +define <1 x i1> @whilewr_64_scalarize(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_scalarize: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: cmp x8, #7 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: cmp x1, x0 +; CHECK-NEXT: csinc w0, w8, wzr, ne +; CHECK-NEXT: ret +entry: + %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 8) + ret <1 x i1> %0 +} + +define <1 x i1> @whilerw_8_scalarize(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_8_scalarize: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: cmp x1, x0 +; CHECK-NEXT: csinc w0, w8, wzr, ne +; CHECK-NEXT: ret +entry: + %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 1) + ret <1 x i1> %0 +} + +define <1 x i1> @whilerw_16_scalarize(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_16_scalarize: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: cmp x8, #1 +; 
CHECK-NEXT: cset w8, gt +; CHECK-NEXT: cmp x1, x0 +; CHECK-NEXT: csinc w0, w8, wzr, ne +; CHECK-NEXT: ret +entry: + %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 2) + ret <1 x i1> %0 +} + +define <1 x i1> @whilerw_32_scalarize(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_32_scalarize: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: cmp x8, #3 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: cmp x1, x0 +; CHECK-NEXT: csinc w0, w8, wzr, ne +; CHECK-NEXT: ret +entry: + %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 4) + ret <1 x i1> %0 +} + +define <1 x i1> @whilerw_64_scalarize(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_64_scalarize: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: cmp x8, #7 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: cmp x1, x0 +; CHECK-NEXT: csinc w0, w8, wzr, ne +; CHECK-NEXT: ret +entry: + %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 8) + ret <1 x i1> %0 +} diff --git a/llvm/test/CodeGen/AArch64/alias_mask_nosve.ll b/llvm/test/CodeGen/AArch64/alias_mask_nosve.ll new file mode 100644 index 000000000000..922b37c2f2a0 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/alias_mask_nosve.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s + +define <16 x i1> @whilewr_8(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: adrp x10, .LCPI0_1 +; CHECK-NEXT: sub x9, x1, x0 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: adrp x8, .LCPI0_2 +; CHECK-NEXT: ldr q1, [x10, :lo12:.LCPI0_1] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_2] +; CHECK-NEXT: adrp x8, .LCPI0_4 +; CHECK-NEXT: adrp x10, .LCPI0_3 +; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI0_4] +; CHECK-NEXT: adrp x8, .LCPI0_5 +; CHECK-NEXT: dup v2.2d, x9 +; CHECK-NEXT: ldr q4, [x10, 
:lo12:.LCPI0_3] +; CHECK-NEXT: adrp x10, .LCPI0_6 +; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI0_5] +; CHECK-NEXT: adrp x8, .LCPI0_7 +; CHECK-NEXT: ldr q7, [x10, :lo12:.LCPI0_6] +; CHECK-NEXT: cmp x9, #1 +; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI0_7] +; CHECK-NEXT: cmhi v0.2d, v2.2d, v0.2d +; CHECK-NEXT: cmhi v1.2d, v2.2d, v1.2d +; CHECK-NEXT: cmhi v3.2d, v2.2d, v3.2d +; CHECK-NEXT: cmhi v4.2d, v2.2d, v4.2d +; CHECK-NEXT: cmhi v5.2d, v2.2d, v5.2d +; CHECK-NEXT: cmhi v6.2d, v2.2d, v6.2d +; CHECK-NEXT: cmhi v7.2d, v2.2d, v7.2d +; CHECK-NEXT: cmhi v2.2d, v2.2d, v16.2d +; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: uzp1 v1.4s, v4.4s, v3.4s +; CHECK-NEXT: uzp1 v3.4s, v6.4s, v5.4s +; CHECK-NEXT: uzp1 v2.4s, v2.4s, v7.4s +; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; CHECK-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; CHECK-NEXT: dup v1.16b, w8 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1) + ret <16 x i1> %0 +} diff --git a/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll b/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll new file mode 100644 index 000000000000..179dcfa11c10 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll @@ -0,0 +1,767 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s + +define <vscale x 16 x i1> @whilewr_8(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.b, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 1) + ret <vscale x 16 x i1> %0 +} + +define <vscale x 8 x i1> @whilewr_16(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.h, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 
8 x i1> @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 2) + ret <vscale x 8 x i1> %0 +} + +define <vscale x 4 x i1> @whilewr_32(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.s, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr %a, ptr %b, i64 4) + ret <vscale x 4 x i1> %0 +} + +define <vscale x 2 x i1> @whilewr_64(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.d, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 2 x i1> @llvm.loop.dependence.war.mask.nxv2i1(ptr %a, ptr %b, i64 8) + ret <vscale x 2 x i1> %0 +} + +define <vscale x 16 x i1> @whilerw_8(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.b, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 16 x i1> @llvm.loop.dependence.raw.mask.nxv16i1(ptr %a, ptr %b, i64 1) + ret <vscale x 16 x i1> %0 +} + +define <vscale x 8 x i1> @whilerw_16(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.h, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 8 x i1> @llvm.loop.dependence.raw.mask.nxv8i1(ptr %a, ptr %b, i64 2) + ret <vscale x 8 x i1> %0 +} + +define <vscale x 4 x i1> @whilerw_32(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.s, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr %a, ptr %b, i64 4) + ret <vscale x 4 x i1> %0 +} + +define <vscale x 2 x i1> @whilerw_64(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.d, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 2 x i1> @llvm.loop.dependence.raw.mask.nxv2i1(ptr %a, ptr %b, i64 8) + ret <vscale x 2 x i1> %0 +} + +define <vscale x 32 x i1> @whilewr_8_split(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8_split: 
+; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.b, x0, x1 +; CHECK-NEXT: incb x0 +; CHECK-NEXT: whilewr p1.b, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 32 x i1> @llvm.loop.dependence.war.mask.nxv32i1(ptr %a, ptr %b, i64 1) + ret <vscale x 32 x i1> %0 +} + +define <vscale x 64 x i1> @whilewr_8_split2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8_split2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: whilewr p0.b, x0, x1 +; CHECK-NEXT: addvl x9, x0, #3 +; CHECK-NEXT: incb x0, all, mul #2 +; CHECK-NEXT: incb x8 +; CHECK-NEXT: whilewr p3.b, x9, x1 +; CHECK-NEXT: whilewr p2.b, x0, x1 +; CHECK-NEXT: whilewr p1.b, x8, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 64 x i1> @llvm.loop.dependence.war.mask.nxv64i1(ptr %a, ptr %b, i64 1) + ret <vscale x 64 x i1> %0 +} + +define <vscale x 16 x i1> @whilewr_16_expand(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16_expand: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: sub x8, x1, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, x8, x8, lsr #63 +; CHECK-NEXT: asr x8, x8, #1 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: incd z4.d, all, mul #2 +; CHECK-NEXT: incd z5.d, all, mul #4 +; CHECK-NEXT: cmphi p2.d, p0/z, z2.d, z0.d +; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: cmphi p1.d, p0/z, z2.d, z1.d +; CHECK-NEXT: incd z1.d, all, mul #4 +; CHECK-NEXT: cmphi p3.d, 
p0/z, z2.d, z4.d +; CHECK-NEXT: incd z4.d, all, mul #4 +; CHECK-NEXT: cmphi p4.d, p0/z, z2.d, z5.d +; CHECK-NEXT: incd z3.d, all, mul #2 +; CHECK-NEXT: cmphi p5.d, p0/z, z2.d, z1.d +; CHECK-NEXT: cmphi p7.d, p0/z, z2.d, z4.d +; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: cmphi p6.d, p0/z, z2.d, z3.d +; CHECK-NEXT: uzp1 p2.s, p4.s, p5.s +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: incd z0.d, all, mul #4 +; CHECK-NEXT: uzp1 p3.s, p3.s, p6.s +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z0.d +; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p0.s, p7.s, p0.s +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.h, p2.h, p0.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b +; CHECK-NEXT: whilelo p1.b, xzr, x8 +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 2) + ret <vscale x 16 x i1> %0 +} + +define <vscale x 32 x i1> @whilewr_16_expand2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16_expand2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: sub x8, x1, x0 +; CHECK-NEXT: incb x0, all, mul #2 +; CHECK-NEXT: add x8, x8, x8, lsr #63 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: asr x8, x8, #1 +; CHECK-NEXT: sub x9, x1, x0 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z5.d, x8 +; CHECK-NEXT: add x9, x9, x9, lsr #63 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: incd z2.d, all, mul #2 +; CHECK-NEXT: incd z3.d, all, mul #4 +; CHECK-NEXT: cmphi p2.d, p0/z, z5.d, z0.d +; CHECK-NEXT: asr x9, x9, #1 +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: cmphi p1.d, p0/z, z5.d, z1.d +; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z3.d +; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z2.d +; CHECK-NEXT: incd z4.d, all, mul #2 +; CHECK-NEXT: incd z6.d, all, mul #4 +; CHECK-NEXT: incd z7.d, all, mul #4 +; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s +; CHECK-NEXT: mov z24.d, z4.d +; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z6.d +; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z4.d +; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z7.d +; CHECK-NEXT: incd z24.d, all, mul #4 +; CHECK-NEXT: uzp1 p2.s, p3.s, p4.s +; CHECK-NEXT: uzp1 p3.s, p5.s, p6.s +; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z24.d +; CHECK-NEXT: mov z5.d, x9 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z24.d +; 
CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z7.d +; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z6.d +; CHECK-NEXT: uzp1 p7.s, p7.s, p8.s +; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z3.d +; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z4.d +; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z2.d +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p2.h, p2.h, p7.h +; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z1.d +; CHECK-NEXT: cmphi p0.d, p0/z, z5.d, z0.d +; CHECK-NEXT: uzp1 p4.s, p5.s, p4.s +; CHECK-NEXT: uzp1 p5.s, p9.s, p6.s +; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: whilelo p6.b, xzr, x8 +; CHECK-NEXT: uzp1 p3.s, p8.s, p3.s +; CHECK-NEXT: cmp x9, #1 +; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.s, p0.s, p7.s +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p4.h, p5.h, p4.h +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.h, p0.h, p3.h +; CHECK-NEXT: uzp1 p1.b, p1.b, p2.b +; CHECK-NEXT: uzp1 p2.b, p0.b, p4.b +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: whilelo p3.b, xzr, x8 +; CHECK-NEXT: sel p0.b, p1, p1.b, p6.b +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel p1.b, p2, p2.b, p3.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 32 x i1> @llvm.loop.dependence.war.mask.nxv32i1(ptr %a, ptr %b, i64 2) + ret <vscale x 32 x i1> %0 +} + +define <vscale x 8 x i1> @whilewr_32_expand(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_expand: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x9, x8, #3 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #2 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: 
incd z1.d +; CHECK-NEXT: incd z2.d, all, mul #2 +; CHECK-NEXT: cmphi p1.d, p0/z, z3.d, z0.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z1.d +; CHECK-NEXT: cmphi p3.d, p0/z, z3.d, z2.d +; CHECK-NEXT: incd z4.d, all, mul #2 +; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s +; CHECK-NEXT: cmphi p0.d, p0/z, z3.d, z4.d +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p0.s, p3.s, p0.s +; CHECK-NEXT: uzp1 p0.h, p1.h, p0.h +; CHECK-NEXT: whilelo p1.h, xzr, x8 +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 8 x i1> @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 4) + ret <vscale x 8 x i1> %0 +} + +define <vscale x 16 x i1> @whilewr_32_expand2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_expand2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x9, x8, #3 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #2 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: incd z4.d, all, mul #2 +; CHECK-NEXT: incd z5.d, all, mul #4 +; CHECK-NEXT: cmphi p2.d, p0/z, z2.d, z0.d +; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: cmphi p1.d, p0/z, z2.d, z1.d +; CHECK-NEXT: incd z1.d, all, mul #4 +; CHECK-NEXT: cmphi p3.d, p0/z, z2.d, z4.d +; CHECK-NEXT: incd z4.d, all, mul #4 +; CHECK-NEXT: cmphi 
p4.d, p0/z, z2.d, z5.d +; CHECK-NEXT: incd z3.d, all, mul #2 +; CHECK-NEXT: cmphi p5.d, p0/z, z2.d, z1.d +; CHECK-NEXT: cmphi p7.d, p0/z, z2.d, z4.d +; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: cmphi p6.d, p0/z, z2.d, z3.d +; CHECK-NEXT: uzp1 p2.s, p4.s, p5.s +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: incd z0.d, all, mul #4 +; CHECK-NEXT: uzp1 p3.s, p3.s, p6.s +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z0.d +; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p0.s, p7.s, p0.s +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.h, p2.h, p0.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b +; CHECK-NEXT: whilelo p1.b, xzr, x8 +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 4) + ret <vscale x 16 x i1> %0 +} + +define <vscale x 32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_expand3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x9, x8, #3 +; CHECK-NEXT: incb x0, all, mul #4 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #2 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, x8 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: incd z2.d, all, mul #2 +; CHECK-NEXT: incd z4.d, all, mul #4 +; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z0.d +; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: mov z6.d, z2.d +; CHECK-NEXT: mov z7.d, z1.d +; CHECK-NEXT: cmphi p2.d, p0/z, z5.d, z4.d +; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z2.d +; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z1.d +; CHECK-NEXT: incd z3.d, all, mul #2 +; CHECK-NEXT: incd z6.d, all, mul #4 +; CHECK-NEXT: incd z7.d, all, mul #4 +; CHECK-NEXT: uzp1 p4.s, p5.s, p4.s +; CHECK-NEXT: mov z24.d, z3.d +; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z6.d +; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z7.d +; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z3.d +; CHECK-NEXT: incd z24.d, all, mul #4 +; CHECK-NEXT: uzp1 p2.s, p2.s, p7.s +; CHECK-NEXT: uzp1 p3.s, p3.s, p8.s +; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z24.d +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: uzp1 p3.h, p4.h, p3.h +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p6.s, p6.s, p9.s +; CHECK-NEXT: 
whilelo p1.b, xzr, x8 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: uzp1 p2.h, p2.h, p6.h +; CHECK-NEXT: add x9, x8, #3 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: uzp1 p2.b, p3.b, p2.b +; CHECK-NEXT: asr x8, x8, #2 +; CHECK-NEXT: mov z5.d, x8 +; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z24.d +; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z6.d +; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z7.d +; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z4.d +; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z3.d +; CHECK-NEXT: cmphi p10.d, p0/z, z5.d, z2.d +; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z1.d +; CHECK-NEXT: cmphi p0.d, p0/z, z5.d, z0.d +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: uzp1 p5.s, p7.s, p5.s +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: uzp1 p7.s, p9.s, p8.s +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p4.s, p10.s, p4.s +; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.s, p0.s, p6.s +; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p5.h, p7.h, p5.h +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.h, p0.h, p4.h +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: whilelo p4.b, xzr, x8 +; CHECK-NEXT: uzp1 p3.b, p0.b, p5.b +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel p0.b, p2, p2.b, p1.b +; CHECK-NEXT: sel p1.b, p3, p3.b, p4.b +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 32 x i1> @llvm.loop.dependence.war.mask.nxv32i1(ptr %a, ptr %b, i64 4) + ret <vscale x 32 x i1> %0 +} + +define <vscale x 4 x i1> @whilewr_64_expand(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_expand: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add 
x9, x8, #7 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #3 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: cmphi p1.d, p0/z, z2.d, z0.d +; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z1.d +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p0.s, p1.s, p0.s +; CHECK-NEXT: whilelo p1.s, xzr, x8 +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr %a, ptr %b, i64 8) + ret <vscale x 4 x i1> %0 +} + +define <vscale x 8 x i1> @whilewr_64_expand2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_expand2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x9, x8, #7 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #3 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: incd z2.d, all, mul #2 +; CHECK-NEXT: cmphi p1.d, p0/z, z3.d, z0.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z1.d +; CHECK-NEXT: cmphi p3.d, p0/z, z3.d, z2.d +; CHECK-NEXT: incd z4.d, all, mul #2 +; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s +; CHECK-NEXT: cmphi p0.d, p0/z, z3.d, z4.d +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p0.s, p3.s, p0.s +; CHECK-NEXT: uzp1 p0.h, p1.h, p0.h +; CHECK-NEXT: whilelo p1.h, xzr, x8 +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 8 x i1> @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 8) + ret <vscale x 8 x i1> %0 +} + +define <vscale x 16 x i1> @whilewr_64_expand3(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_expand3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x9, x8, #7 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #3 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: incd z4.d, all, mul #2 +; CHECK-NEXT: incd z5.d, all, mul #4 +; CHECK-NEXT: cmphi p2.d, p0/z, z2.d, z0.d +; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: cmphi p1.d, p0/z, z2.d, z1.d +; CHECK-NEXT: incd z1.d, all, mul #4 +; CHECK-NEXT: cmphi p3.d, p0/z, z2.d, z4.d +; CHECK-NEXT: incd z4.d, all, mul #4 +; CHECK-NEXT: cmphi p4.d, p0/z, z2.d, z5.d +; CHECK-NEXT: incd z3.d, all, mul #2 +; CHECK-NEXT: cmphi p5.d, p0/z, z2.d, z1.d +; CHECK-NEXT: cmphi p7.d, p0/z, z2.d, z4.d +; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: cmphi p6.d, p0/z, z2.d, z3.d +; CHECK-NEXT: uzp1 p2.s, p4.s, p5.s +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: incd z0.d, all, mul #4 +; CHECK-NEXT: uzp1 p3.s, p3.s, p6.s +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z0.d +; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p0.s, p7.s, p0.s +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.h, p2.h, p0.h +; CHECK-NEXT: 
uzp1 p0.b, p1.b, p0.b +; CHECK-NEXT: whilelo p1.b, xzr, x8 +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 8) + ret <vscale x 16 x i1> %0 +} + +define <vscale x 32 x i1> @whilewr_64_expand4(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_expand4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x9, x8, #7 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: addvl x9, x0, #8 +; CHECK-NEXT: asr x8, x8, #3 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, x8 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: incd z2.d, all, mul #2 +; CHECK-NEXT: incd z4.d, all, mul #4 +; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z0.d +; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: mov z6.d, z2.d +; CHECK-NEXT: mov z7.d, z1.d +; CHECK-NEXT: cmphi p2.d, p0/z, z5.d, z4.d +; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z2.d +; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z1.d +; CHECK-NEXT: incd z3.d, all, mul #2 +; CHECK-NEXT: incd z6.d, all, mul #4 +; CHECK-NEXT: incd z7.d, all, mul #4 +; CHECK-NEXT: uzp1 p4.s, p5.s, 
p4.s +; CHECK-NEXT: mov z24.d, z3.d +; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z6.d +; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z7.d +; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z3.d +; CHECK-NEXT: incd z24.d, all, mul #4 +; CHECK-NEXT: uzp1 p2.s, p2.s, p7.s +; CHECK-NEXT: uzp1 p3.s, p3.s, p8.s +; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z24.d +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: uzp1 p3.h, p4.h, p3.h +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p6.s, p6.s, p9.s +; CHECK-NEXT: whilelo p1.b, xzr, x8 +; CHECK-NEXT: subs x8, x1, x9 +; CHECK-NEXT: uzp1 p2.h, p2.h, p6.h +; CHECK-NEXT: add x9, x8, #7 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: uzp1 p2.b, p3.b, p2.b +; CHECK-NEXT: asr x8, x8, #3 +; CHECK-NEXT: mov z5.d, x8 +; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z24.d +; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z6.d +; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z7.d +; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z4.d +; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z3.d +; CHECK-NEXT: cmphi p10.d, p0/z, z5.d, z2.d +; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z1.d +; CHECK-NEXT: cmphi p0.d, p0/z, z5.d, z0.d +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: uzp1 p5.s, p7.s, p5.s +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: uzp1 p7.s, p9.s, p8.s +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p4.s, p10.s, p4.s +; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.s, p0.s, p6.s +; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p5.h, p7.h, p5.h +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.h, p0.h, p4.h +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: whilelo p4.b, xzr, x8 +; CHECK-NEXT: uzp1 p3.b, p0.b, p5.b +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel p0.b, p2, p2.b, p1.b +; CHECK-NEXT: sel p1.b, p3, p3.b, p4.b +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 
2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 32 x i1> @llvm.loop.dependence.war.mask.nxv32i1(ptr %a, ptr %b, i64 8) + ret <vscale x 32 x i1> %0 +} + +define <vscale x 9 x i1> @whilewr_8_widen(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8_widen: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.b, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 9 x i1> @llvm.loop.dependence.war.mask.nxv9i1(ptr %a, ptr %b, i64 1) + ret <vscale x 9 x i1> %0 +} + +define <vscale x 7 x i1> @whilewr_16_widen(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16_widen: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.h, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 7 x i1> @llvm.loop.dependence.war.mask.nxv7i1(ptr %a, ptr %b, i64 2) + ret <vscale x 7 x i1> %0 +} + +define <vscale x 3 x i1> @whilewr_32_widen(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_widen: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.s, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 3 x i1> @llvm.loop.dependence.war.mask.nxv3i1(ptr %a, ptr %b, i64 4) + ret <vscale x 3 x i1> %0 +} + +define <vscale x 16 x i1> @whilewr_badimm(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_badimm: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555 +; CHECK-NEXT: sub x9, x1, x0 +; CHECK-NEXT: movk x8, #21846 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: smulh x8, x9, x8 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: add x8, x8, x8, lsr #63 +; CHECK-NEXT: incd z4.d, all, mul #2 +; CHECK-NEXT: incd z5.d, all, mul #4 +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: cmphi p2.d, p0/z, z2.d, z0.d +; CHECK-NEXT: cmphi p1.d, p0/z, z2.d, z1.d +; CHECK-NEXT: incd z1.d, all, mul #4 +; CHECK-NEXT: incd z3.d, all, mul #2 +; CHECK-NEXT: cmphi p3.d, p0/z, z2.d, z4.d +; CHECK-NEXT: incd z4.d, all, mul #4 +; CHECK-NEXT: cmphi p4.d, p0/z, z2.d, z5.d +; CHECK-NEXT: cmphi p5.d, p0/z, z2.d, z1.d +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: cmphi p6.d, p0/z, z2.d, z3.d +; CHECK-NEXT: cmphi p7.d, p0/z, z2.d, z4.d +; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s +; CHECK-NEXT: incd z0.d, all, mul #4 +; CHECK-NEXT: uzp1 p2.s, p4.s, p5.s +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p3.s, p3.s, p6.s +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z0.d +; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p0.s, p7.s, p0.s +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 
2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.h, p2.h, p0.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b +; CHECK-NEXT: whilelo p1.b, xzr, x8 +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 3) + ret <vscale x 16 x i1> %0 +} diff --git a/llvm/test/CodeGen/AArch64/alias_mask_scalable_nosve2.ll b/llvm/test/CodeGen/AArch64/alias_mask_scalable_nosve2.ll new file mode 100644 index 000000000000..8b5ea0bc3b3c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/alias_mask_scalable_nosve2.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64 -mattr=+sve %s -o - | FileCheck %s + +define <vscale x 16 x i1> @whilewr_8(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: sub x8, x1, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: cmphi p1.d, p0/z, z2.d, z0.d +; CHECK-NEXT: incd z0.d, all, mul #4 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: incd z3.d, all, mul #2 +; CHECK-NEXT: cmphi p5.d, p0/z, z2.d, z0.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: cmphi p2.d, p0/z, z2.d, z1.d +; CHECK-NEXT: incd z1.d, all, mul #4 +; CHECK-NEXT: cmphi p3.d, p0/z, z2.d, z3.d +; CHECK-NEXT: incd z3.d, all, mul #4 +; CHECK-NEXT: incd z4.d, all, mul #2 +; CHECK-NEXT: cmphi p6.d, p0/z, z2.d, z1.d +; CHECK-NEXT: cmphi p7.d, p0/z, z2.d, z3.d +; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s +; CHECK-NEXT: cmphi p4.d, p0/z, z2.d, z4.d +; CHECK-NEXT: incd z4.d, all, mul #4 +; CHECK-NEXT: uzp1 p2.s, p5.s, p6.s +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z4.d +; CHECK-NEXT: uzp1 p3.s, p3.s, p4.s +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p0.s, p7.s, p0.s +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.h, p2.h, p0.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b +; CHECK-NEXT: whilelo p1.b, xzr, x8 +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, 
[sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 1) + ret <vscale x 16 x i1> %0 +} diff --git a/llvm/test/CodeGen/AArch64/alloca-oversized.ll b/llvm/test/CodeGen/AArch64/alloca-oversized.ll index e57bbcdf9980..81d301019e7b 100644 --- a/llvm/test/CodeGen/AArch64/alloca-oversized.ll +++ b/llvm/test/CodeGen/AArch64/alloca-oversized.ll @@ -10,10 +10,7 @@ define void @test_oversized(ptr %dst, i32 %cond) { ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov x9, #2305843009213693952 // =0x2000000000000000 -; CHECK-NEXT: sub x8, x8, x9 ; CHECK-NEXT: sub x9, x29, #32 -; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: cmp w1, #0 ; CHECK-NEXT: csel x8, x9, x8, eq ; CHECK-NEXT: str x8, [x0] diff --git a/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll index 7934e39b2b69..78e20f2a5e21 100644 --- a/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll +++ b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll @@ -69,14 +69,14 @@ define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone { ; CHECK-LABEL: add_sub_su64: ; CHECK: // %bb.0: ; CHECK-NEXT: add d0, d1, d0 -; CHECK-NEXT: fmov d1, xzr +; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: sub d0, d1, d0 ; CHECK-NEXT: ret ; ; GENERIC-LABEL: add_sub_su64: ; GENERIC: // %bb.0: ; GENERIC-NEXT: add d0, d1, d0 -; GENERIC-NEXT: fmov d1, xzr +; GENERIC-NEXT: movi d1, #0000000000000000 ; GENERIC-NEXT: sub d0, d1, d0 ; GENERIC-NEXT: ret %vecext = extractelement <2 x i64> %a, i32 0 diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll index 4fe01e838771..cad5df0d9655 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll @@ -632,11 +632,11 @@ define i64 @select_noccmp2(i64 %v1, i64 %v2, i64 %v3, i64 %r) { ; CHECK-SD-NEXT: cmp x0, #0 ; 
CHECK-SD-NEXT: ccmp x0, #13, #0, pl ; CHECK-SD-NEXT: cset w8, gt +; CHECK-SD-NEXT: csetm w9, gt ; CHECK-SD-NEXT: cmp w8, #0 ; CHECK-SD-NEXT: csel x0, xzr, x3, ne -; CHECK-SD-NEXT: sbfx w8, w8, #0, #1 -; CHECK-SD-NEXT: adrp x9, _g@PAGE -; CHECK-SD-NEXT: str w8, [x9, _g@PAGEOFF] +; CHECK-SD-NEXT: adrp x8, _g@PAGE +; CHECK-SD-NEXT: str w9, [x8, _g@PAGEOFF] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: select_noccmp2: diff --git a/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir b/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir index 76b5b7613065..284d624a4e68 100644 --- a/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir +++ b/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir @@ -1,15 +1,15 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz" %s \ +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \ # RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz" %s \ +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \ # RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz" %s \ +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \ # RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios 
-run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz" %s \ +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \ # RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz" %s \ +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \ # RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz" %s \ +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \ # RUN: | FileCheck --check-prefix=CHECK-ZCM-ZCZ %s --- | diff --git a/llvm/test/CodeGen/AArch64/arm64-crypto.ll b/llvm/test/CodeGen/AArch64/arm64-crypto.ll index 1def7588e7bd..160fc82cbabb 100644 --- a/llvm/test/CodeGen/AArch64/arm64-crypto.ll +++ b/llvm/test/CodeGen/AArch64/arm64-crypto.ll @@ -1,5 +1,7 @@ -; RUN: llc -mtriple=arm64-eabi -mattr=crypto -aarch64-neon-syntax=apple -o - %s | FileCheck %s -; RUN: llc -mtriple=arm64-eabi -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* -mattr=crypto -aarch64-neon-syntax=apple -o - %s 2>&1 | FileCheck %s --check-prefixes=CHECK,FALLBACK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=arm64-eabi -mattr=crypto -aarch64-neon-syntax=apple -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=arm64-eabi -mattr=crypto -aarch64-neon-syntax=apple -global-isel -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI + declare <16 x i8> 
@llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key) declare <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data, <16 x i8> %key) @@ -8,28 +10,36 @@ declare <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %data) define <16 x i8> @test_aese(<16 x i8> %data, <16 x i8> %key) { ; CHECK-LABEL: test_aese: -; CHECK: aese.16b v0, v1 +; CHECK: // %bb.0: +; CHECK-NEXT: aese.16b v0, v1 +; CHECK-NEXT: ret %res = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key) ret <16 x i8> %res } define <16 x i8> @test_aesd(<16 x i8> %data, <16 x i8> %key) { ; CHECK-LABEL: test_aesd: -; CHECK: aesd.16b v0, v1 +; CHECK: // %bb.0: +; CHECK-NEXT: aesd.16b v0, v1 +; CHECK-NEXT: ret %res = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data, <16 x i8> %key) ret <16 x i8> %res } define <16 x i8> @test_aesmc(<16 x i8> %data) { ; CHECK-LABEL: test_aesmc: -; CHECK: aesmc.16b v0, v0 +; CHECK: // %bb.0: +; CHECK-NEXT: aesmc.16b v0, v0 +; CHECK-NEXT: ret %res = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %data) ret <16 x i8> %res } define <16 x i8> @test_aesimc(<16 x i8> %data) { ; CHECK-LABEL: test_aesimc: -; CHECK: aesimc.16b v0, v0 +; CHECK: // %bb.0: +; CHECK-NEXT: aesimc.16b v0, v0 +; CHECK-NEXT: ret %res = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %data) ret <16 x i8> %res } @@ -43,8 +53,10 @@ declare <4 x i32> @llvm.aarch64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12 define <4 x i32> @test_sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { ; CHECK-LABEL: test_sha1c: -; CHECK: fmov [[HASH_E:s[0-9]+]], w0 -; CHECK: sha1c.4s q0, [[HASH_E]], v1 +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s2, w0 +; CHECK-NEXT: sha1c.4s q0, s2, v1 +; CHECK-NEXT: ret %res = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) ret <4 x i32> %res } @@ -52,10 +64,12 @@ define <4 x i32> @test_sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { ; <rdar://problem/14742333> Incomplete removal of unnecessary FMOV 
instructions in intrinsic SHA1 define <4 x i32> @test_sha1c_in_a_row(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { ; CHECK-LABEL: test_sha1c_in_a_row: -; CHECK: fmov [[HASH_E:s[0-9]+]], w0 -; CHECK: sha1c.4s q[[SHA1RES:[0-9]+]], [[HASH_E]], v1 -; CHECK-NOT: fmov -; CHECK: sha1c.4s q0, s[[SHA1RES]], v1 +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s2, w0 +; CHECK-NEXT: mov.16b v3, v0 +; CHECK-NEXT: sha1c.4s q3, s2, v1 +; CHECK-NEXT: sha1c.4s q0, s3, v1 +; CHECK-NEXT: ret %res = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) %extract = extractelement <4 x i32> %res, i32 0 %res2 = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %extract, <4 x i32> %wk) @@ -64,40 +78,49 @@ define <4 x i32> @test_sha1c_in_a_row(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i3 define <4 x i32> @test_sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { ; CHECK-LABEL: test_sha1p: -; CHECK: fmov [[HASH_E:s[0-9]+]], w0 -; CHECK: sha1p.4s q0, [[HASH_E]], v1 +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s2, w0 +; CHECK-NEXT: sha1p.4s q0, s2, v1 +; CHECK-NEXT: ret %res = call <4 x i32> @llvm.aarch64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) ret <4 x i32> %res } define <4 x i32> @test_sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { ; CHECK-LABEL: test_sha1m: -; CHECK: fmov [[HASH_E:s[0-9]+]], w0 -; CHECK: sha1m.4s q0, [[HASH_E]], v1 +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s2, w0 +; CHECK-NEXT: sha1m.4s q0, s2, v1 +; CHECK-NEXT: ret %res = call <4 x i32> @llvm.aarch64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) ret <4 x i32> %res } -; FALLBACK-NOT: remark{{.*}}test_sha1h define i32 @test_sha1h(i32 %hash_e) { ; CHECK-LABEL: test_sha1h: -; CHECK: fmov [[HASH_E:s[0-9]+]], w0 -; CHECK: sha1h [[RES:s[0-9]+]], [[HASH_E]] -; CHECK: fmov w0, [[RES]] +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: sha1h s0, s0 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret %res = call i32 
@llvm.aarch64.crypto.sha1h(i32 %hash_e) ret i32 %res } define <4 x i32> @test_sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) { ; CHECK-LABEL: test_sha1su0: -; CHECK: sha1su0.4s v0, v1, v2 +; CHECK: // %bb.0: +; CHECK-NEXT: sha1su0.4s v0, v1, v2 +; CHECK-NEXT: ret %res = call <4 x i32> @llvm.aarch64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) ret <4 x i32> %res } define <4 x i32> @test_sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) { ; CHECK-LABEL: test_sha1su1: -; CHECK: sha1su1.4s v0, v1 +; CHECK: // %bb.0: +; CHECK-NEXT: sha1su1.4s v0, v1 +; CHECK-NEXT: ret %res = call <4 x i32> @llvm.aarch64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) ret <4 x i32> %res } @@ -109,14 +132,18 @@ declare <4 x i32> @llvm.aarch64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_ define <4 x i32> @test_sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) { ; CHECK-LABEL: test_sha256h: -; CHECK: sha256h.4s q0, q1, v2 +; CHECK: // %bb.0: +; CHECK-NEXT: sha256h.4s q0, q1, v2 +; CHECK-NEXT: ret %res = call <4 x i32> @llvm.aarch64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) ret <4 x i32> %res } define <4 x i32> @test_sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) { ; CHECK-LABEL: test_sha256h2: -; CHECK: sha256h2.4s q0, q1, v2 +; CHECK: // %bb.0: +; CHECK-NEXT: sha256h2.4s q0, q1, v2 +; CHECK-NEXT: ret %res = call <4 x i32> @llvm.aarch64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) ret <4 x i32> %res @@ -124,14 +151,21 @@ define <4 x i32> @test_sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x define <4 x i32> @test_sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) { ; CHECK-LABEL: test_sha256su0: -; CHECK: sha256su0.4s v0, v1 +; CHECK: // %bb.0: +; CHECK-NEXT: sha256su0.4s v0, v1 +; CHECK-NEXT: ret %res = call <4 x i32> @llvm.aarch64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) ret <4 x i32> %res } define <4 x i32> @test_sha256su1(<4 x 
i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) { ; CHECK-LABEL: test_sha256su1: -; CHECK: sha256su1.4s v0, v1, v2 +; CHECK: // %bb.0: +; CHECK-NEXT: sha256su1.4s v0, v1, v2 +; CHECK-NEXT: ret %res = call <4 x i32> @llvm.aarch64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) ret <4 x i32> %res } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-GI: {{.*}} +; CHECK-SD: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt.ll index be21776e26f8..60fcb643fb9f 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vcvt.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vcvt.ll @@ -1,43 +1,48 @@ -; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s -; RUN: llc < %s -mtriple=arm64-eabi -pass-remarks-missed=gisel-* \ -; RUN: -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | \ -; RUN: FileCheck %s --check-prefixes=FALLBACK,CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for fcvtas_1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fcvtau_1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fcvtms_1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fcvtmu_1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fcvtps_1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fcvtpu_1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fcvtns_1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for 
fcvtnu_1d -; FALLBACK-NOT: remark{{.*}}fcvtas_2s define <2 x i32> @fcvtas_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: fcvtas_2s: -;CHECK-NOT: ld1 -;CHECK: fcvtas.2s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtas_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtas.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> %A) ret <2 x i32> %tmp3 } -; FALLBACK-NOT: remark{{.*}}fcvtas_4s define <4 x i32> @fcvtas_4s(<4 x float> %A) nounwind { -;CHECK-LABEL: fcvtas_4s: -;CHECK-NOT: ld1 -;CHECK: fcvtas.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtas_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtas.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> %A) ret <4 x i32> %tmp3 } -; FALLBACK-NOT: remark{{.*}}fcvtas_2d define <2 x i64> @fcvtas_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: fcvtas_2d: -;CHECK-NOT: ld1 -;CHECK: fcvtas.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtas_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtas.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> %A) ret <2 x i64> %tmp3 } define <1 x i64> @fcvtas_1d(<1 x double> %A) nounwind { -;CHECK-LABEL: fcvtas_1d: -;CHECK-NOT: ld1 -;CHECK: fcvtas d0, d0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtas_1d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtas d0, d0 +; CHECK-NEXT: ret %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> %A) ret <1 x i64> %tmp3 } @@ -48,37 +53,37 @@ declare <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double>) nounwind r declare <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x i32> @fcvtau_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: fcvtau_2s: -;CHECK-NOT: ld1 -;CHECK: fcvtau.2s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtau_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtau.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> %A) ret <2 x 
i32> %tmp3 } define <4 x i32> @fcvtau_4s(<4 x float> %A) nounwind { -;CHECK-LABEL: fcvtau_4s: -;CHECK-NOT: ld1 -;CHECK: fcvtau.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtau_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtau.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> %A) ret <4 x i32> %tmp3 } define <2 x i64> @fcvtau_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: fcvtau_2d: -;CHECK-NOT: ld1 -;CHECK: fcvtau.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtau_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtau.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> %A) ret <2 x i64> %tmp3 } define <1 x i64> @fcvtau_1d(<1 x double> %A) nounwind { -;CHECK-LABEL: fcvtau_1d: -;CHECK-NOT: ld1 -;CHECK: fcvtau d0, d0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtau_1d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtau d0, d0 +; CHECK-NEXT: ret %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> %A) ret <1 x i64> %tmp3 } @@ -89,37 +94,37 @@ declare <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double>) nounwind r declare <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x i32> @fcvtms_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: fcvtms_2s: -;CHECK-NOT: ld1 -;CHECK: fcvtms.2s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtms_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtms.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> %A) ret <2 x i32> %tmp3 } define <4 x i32> @fcvtms_4s(<4 x float> %A) nounwind { -;CHECK-LABEL: fcvtms_4s: -;CHECK-NOT: ld1 -;CHECK: fcvtms.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtms_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtms.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> %A) ret <4 x i32> %tmp3 } define <2 x i64> @fcvtms_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: fcvtms_2d: -;CHECK-NOT: ld1 
-;CHECK: fcvtms.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtms_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtms.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> %A) ret <2 x i64> %tmp3 } define <1 x i64> @fcvtms_1d(<1 x double> %A) nounwind { -;CHECK-LABEL: fcvtms_1d: -;CHECK-NOT: ld1 -;CHECK: fcvtms d0, d0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtms_1d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtms d0, d0 +; CHECK-NEXT: ret %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> %A) ret <1 x i64> %tmp3 } @@ -130,37 +135,37 @@ declare <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double>) nounwind r declare <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x i32> @fcvtmu_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: fcvtmu_2s: -;CHECK-NOT: ld1 -;CHECK: fcvtmu.2s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtmu_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtmu.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> %A) ret <2 x i32> %tmp3 } define <4 x i32> @fcvtmu_4s(<4 x float> %A) nounwind { -;CHECK-LABEL: fcvtmu_4s: -;CHECK-NOT: ld1 -;CHECK: fcvtmu.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtmu_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtmu.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> %A) ret <4 x i32> %tmp3 } define <2 x i64> @fcvtmu_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: fcvtmu_2d: -;CHECK-NOT: ld1 -;CHECK: fcvtmu.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtmu_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtmu.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> %A) ret <2 x i64> %tmp3 } define <1 x i64> @fcvtmu_1d(<1 x double> %A) nounwind { -;CHECK-LABEL: fcvtmu_1d: -;CHECK-NOT: ld1 -;CHECK: fcvtmu d0, d0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtmu_1d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtmu d0, 
d0 +; CHECK-NEXT: ret %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> %A) ret <1 x i64> %tmp3 } @@ -171,37 +176,37 @@ declare <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double>) nounwind r declare <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x i32> @fcvtps_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: fcvtps_2s: -;CHECK-NOT: ld1 -;CHECK: fcvtps.2s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtps_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtps.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> %A) ret <2 x i32> %tmp3 } define <4 x i32> @fcvtps_4s(<4 x float> %A) nounwind { -;CHECK-LABEL: fcvtps_4s: -;CHECK-NOT: ld1 -;CHECK: fcvtps.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtps_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtps.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> %A) ret <4 x i32> %tmp3 } define <2 x i64> @fcvtps_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: fcvtps_2d: -;CHECK-NOT: ld1 -;CHECK: fcvtps.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtps_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtps.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> %A) ret <2 x i64> %tmp3 } define <1 x i64> @fcvtps_1d(<1 x double> %A) nounwind { -;CHECK-LABEL: fcvtps_1d: -;CHECK-NOT: ld1 -;CHECK: fcvtps d0, d0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtps_1d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtps d0, d0 +; CHECK-NEXT: ret %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> %A) ret <1 x i64> %tmp3 } @@ -212,37 +217,37 @@ declare <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double>) nounwind r declare <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x i32> @fcvtpu_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: fcvtpu_2s: -;CHECK-NOT: ld1 -;CHECK: fcvtpu.2s v0, v0 -;CHECK-NEXT: ret 
+; CHECK-LABEL: fcvtpu_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtpu.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> %A) ret <2 x i32> %tmp3 } define <4 x i32> @fcvtpu_4s(<4 x float> %A) nounwind { -;CHECK-LABEL: fcvtpu_4s: -;CHECK-NOT: ld1 -;CHECK: fcvtpu.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtpu_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtpu.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> %A) ret <4 x i32> %tmp3 } define <2 x i64> @fcvtpu_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: fcvtpu_2d: -;CHECK-NOT: ld1 -;CHECK: fcvtpu.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtpu_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtpu.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> %A) ret <2 x i64> %tmp3 } define <1 x i64> @fcvtpu_1d(<1 x double> %A) nounwind { -;CHECK-LABEL: fcvtpu_1d: -;CHECK-NOT: ld1 -;CHECK: fcvtpu d0, d0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtpu_1d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtpu d0, d0 +; CHECK-NEXT: ret %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> %A) ret <1 x i64> %tmp3 } @@ -253,37 +258,37 @@ declare <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double>) nounwind r declare <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x i32> @fcvtns_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: fcvtns_2s: -;CHECK-NOT: ld1 -;CHECK: fcvtns.2s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtns_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtns.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> %A) ret <2 x i32> %tmp3 } define <4 x i32> @fcvtns_4s(<4 x float> %A) nounwind { -;CHECK-LABEL: fcvtns_4s: -;CHECK-NOT: ld1 -;CHECK: fcvtns.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtns_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtns.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <4 x 
i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> %A) ret <4 x i32> %tmp3 } define <2 x i64> @fcvtns_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: fcvtns_2d: -;CHECK-NOT: ld1 -;CHECK: fcvtns.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtns_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtns.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> %A) ret <2 x i64> %tmp3 } define <1 x i64> @fcvtns_1d(<1 x double> %A) nounwind { -;CHECK-LABEL: fcvtns_1d: -;CHECK-NOT: ld1 -;CHECK: fcvtns d0, d0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtns_1d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtns d0, d0 +; CHECK-NEXT: ret %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> %A) ret <1 x i64> %tmp3 } @@ -294,37 +299,37 @@ declare <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double>) nounwind r declare <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x i32> @fcvtnu_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: fcvtnu_2s: -;CHECK-NOT: ld1 -;CHECK: fcvtnu.2s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtnu_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtnu.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> %A) ret <2 x i32> %tmp3 } define <4 x i32> @fcvtnu_4s(<4 x float> %A) nounwind { -;CHECK-LABEL: fcvtnu_4s: -;CHECK-NOT: ld1 -;CHECK: fcvtnu.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtnu_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtnu.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> %A) ret <4 x i32> %tmp3 } define <2 x i64> @fcvtnu_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: fcvtnu_2d: -;CHECK-NOT: ld1 -;CHECK: fcvtnu.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtnu_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtnu.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> %A) ret <2 x i64> %tmp3 } define <1 x i64> @fcvtnu_1d(<1 x 
double> %A) nounwind { -;CHECK-LABEL: fcvtnu_1d: -;CHECK-NOT: ld1 -;CHECK: fcvtnu d0, d0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtnu_1d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtnu d0, d0 +; CHECK-NEXT: ret %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> %A) ret <1 x i64> %tmp3 } @@ -335,75 +340,81 @@ declare <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double>) nounwind r declare <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x i32> @fcvtzs_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: fcvtzs_2s: -;CHECK-NOT: ld1 -;CHECK: fcvtzs.2s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzs_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = fptosi <2 x float> %A to <2 x i32> ret <2 x i32> %tmp3 } define <4 x i32> @fcvtzs_4s(<4 x float> %A) nounwind { -;CHECK-LABEL: fcvtzs_4s: -;CHECK-NOT: ld1 -;CHECK: fcvtzs.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzs_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = fptosi <4 x float> %A to <4 x i32> ret <4 x i32> %tmp3 } define <2 x i64> @fcvtzs_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: fcvtzs_2d: -;CHECK-NOT: ld1 -;CHECK: fcvtzs.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzs_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = fptosi <2 x double> %A to <2 x i64> ret <2 x i64> %tmp3 } ; FIXME: Generate "fcvtzs d0, d0"? 
define <1 x i64> @fcvtzs_1d(<1 x double> %A) nounwind { -;CHECK-LABEL: fcvtzs_1d: -;CHECK-NOT: ld1 -;CHECK: fcvtzs x8, d0 -;CHECK-NEXT: mov d0, x8 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzs_1d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs x8, d0 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret %tmp3 = fptosi <1 x double> %A to <1 x i64> ret <1 x i64> %tmp3 } define <2 x i32> @fcvtzs_2s_intrinsic(<2 x float> %A) nounwind { -;CHECK-LABEL: fcvtzs_2s_intrinsic: -;CHECK-NOT: ld1 -;CHECK: fcvtzs.2s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzs_2s_intrinsic: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtzs.v2i32.v2f32(<2 x float> %A) ret <2 x i32> %tmp3 } define <4 x i32> @fcvtzs_4s_intrinsic(<4 x float> %A) nounwind { -;CHECK-LABEL: fcvtzs_4s_intrinsic: -;CHECK-NOT: ld1 -;CHECK: fcvtzs.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzs_4s_intrinsic: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> %A) ret <4 x i32> %tmp3 } define <2 x i64> @fcvtzs_2d_intrinsic(<2 x double> %A) nounwind { -;CHECK-LABEL: fcvtzs_2d_intrinsic: -;CHECK-NOT: ld1 -;CHECK: fcvtzs.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzs_2d_intrinsic: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtzs.v2i64.v2f64(<2 x double> %A) ret <2 x i64> %tmp3 } define <1 x i64> @fcvtzs_1d_intrinsic(<1 x double> %A) nounwind { -;CHECK-LABEL: fcvtzs_1d_intrinsic: -;CHECK-NOT: ld1 -;CHECK: fcvtzs{{.*}}, d0 -;CHECK: ret +; CHECK-SD-LABEL: fcvtzs_1d_intrinsic: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fcvtzs d0, d0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fcvtzs_1d_intrinsic: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fcvtzs x8, d0 +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: ret %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> %A) ret <1 x i64> %tmp3 } @@ -414,75 
+425,81 @@ declare <2 x i64> @llvm.aarch64.neon.fcvtzs.v2i64.v2f64(<2 x double>) nounwind r declare <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x i32> @fcvtzu_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: fcvtzu_2s: -;CHECK-NOT: ld1 -;CHECK: fcvtzu.2s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzu_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = fptoui <2 x float> %A to <2 x i32> ret <2 x i32> %tmp3 } define <4 x i32> @fcvtzu_4s(<4 x float> %A) nounwind { -;CHECK-LABEL: fcvtzu_4s: -;CHECK-NOT: ld1 -;CHECK: fcvtzu.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzu_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = fptoui <4 x float> %A to <4 x i32> ret <4 x i32> %tmp3 } define <2 x i64> @fcvtzu_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: fcvtzu_2d: -;CHECK-NOT: ld1 -;CHECK: fcvtzu.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzu_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = fptoui <2 x double> %A to <2 x i64> ret <2 x i64> %tmp3 } ; FIXME: Generate "fcvtzu d0, d0"? 
define <1 x i64> @fcvtzu_1d(<1 x double> %A) nounwind { -;CHECK-LABEL: fcvtzu_1d: -;CHECK-NOT: ld1 -;CHECK: fcvtzu x8, d0 -;CHECK-NEXT: mov d0, x8 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzu_1d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu x8, d0 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret %tmp3 = fptoui <1 x double> %A to <1 x i64> ret <1 x i64> %tmp3 } define <2 x i32> @fcvtzu_2s_intrinsic(<2 x float> %A) nounwind { -;CHECK-LABEL: fcvtzu_2s_intrinsic: -;CHECK-NOT: ld1 -;CHECK: fcvtzu.2s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzu_2s_intrinsic: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtzu.v2i32.v2f32(<2 x float> %A) ret <2 x i32> %tmp3 } define <4 x i32> @fcvtzu_4s_intrinsic(<4 x float> %A) nounwind { -;CHECK-LABEL: fcvtzu_4s_intrinsic: -;CHECK-NOT: ld1 -;CHECK: fcvtzu.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzu_4s_intrinsic: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> %A) ret <4 x i32> %tmp3 } define <2 x i64> @fcvtzu_2d_intrinsic(<2 x double> %A) nounwind { -;CHECK-LABEL: fcvtzu_2d_intrinsic: -;CHECK-NOT: ld1 -;CHECK: fcvtzu.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzu_2d_intrinsic: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtzu.v2i64.v2f64(<2 x double> %A) ret <2 x i64> %tmp3 } define <1 x i64> @fcvtzu_1d_intrinsic(<1 x double> %A) nounwind { -;CHECK-LABEL: fcvtzu_1d_intrinsic: -;CHECK-NOT: ld1 -;CHECK: fcvtzu{{.*}}, d0 -;CHECK: ret +; CHECK-SD-LABEL: fcvtzu_1d_intrinsic: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fcvtzu d0, d0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fcvtzu_1d_intrinsic: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fcvtzu x8, d0 +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: ret %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> %A) ret <1 x i64> %tmp3 } @@ -493,28 
+510,28 @@ declare <2 x i64> @llvm.aarch64.neon.fcvtzu.v2i64.v2f64(<2 x double>) nounwind r declare <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double>) nounwind readnone define <2 x float> @frinta_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: frinta_2s: -;CHECK-NOT: ld1 -;CHECK: frinta.2s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frinta_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: frinta.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x float> @llvm.round.v2f32(<2 x float> %A) ret <2 x float> %tmp3 } define <4 x float> @frinta_4s(<4 x float> %A) nounwind { -;CHECK-LABEL: frinta_4s: -;CHECK-NOT: ld1 -;CHECK: frinta.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frinta_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: frinta.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <4 x float> @llvm.round.v4f32(<4 x float> %A) ret <4 x float> %tmp3 } define <2 x double> @frinta_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: frinta_2d: -;CHECK-NOT: ld1 -;CHECK: frinta.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frinta_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: frinta.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x double> @llvm.round.v2f64(<2 x double> %A) ret <2 x double> %tmp3 } @@ -524,28 +541,28 @@ declare <4 x float> @llvm.round.v4f32(<4 x float>) nounwind readnone declare <2 x double> @llvm.round.v2f64(<2 x double>) nounwind readnone define <2 x float> @frinti_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: frinti_2s: -;CHECK-NOT: ld1 -;CHECK: frinti.2s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frinti_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: frinti.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %A) ret <2 x float> %tmp3 } define <4 x float> @frinti_4s(<4 x float> %A) nounwind { -;CHECK-LABEL: frinti_4s: -;CHECK-NOT: ld1 -;CHECK: frinti.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frinti_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: frinti.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %A) ret <4 x float> %tmp3 } define <2 x double> 
@frinti_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: frinti_2d: -;CHECK-NOT: ld1 -;CHECK: frinti.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frinti_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: frinti.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %A) ret <2 x double> %tmp3 } @@ -555,28 +572,28 @@ declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) nounwind readnone declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) nounwind readnone define <2 x float> @frintm_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: frintm_2s: -;CHECK-NOT: ld1 -;CHECK: frintm.2s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frintm_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: frintm.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x float> @llvm.floor.v2f32(<2 x float> %A) ret <2 x float> %tmp3 } define <4 x float> @frintm_4s(<4 x float> %A) nounwind { -;CHECK-LABEL: frintm_4s: -;CHECK-NOT: ld1 -;CHECK: frintm.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frintm_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: frintm.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <4 x float> @llvm.floor.v4f32(<4 x float> %A) ret <4 x float> %tmp3 } define <2 x double> @frintm_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: frintm_2d: -;CHECK-NOT: ld1 -;CHECK: frintm.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frintm_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: frintm.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x double> @llvm.floor.v2f64(<2 x double> %A) ret <2 x double> %tmp3 } @@ -586,28 +603,28 @@ declare <4 x float> @llvm.floor.v4f32(<4 x float>) nounwind readnone declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone define <2 x float> @frintn_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: frintn_2s: -;CHECK-NOT: ld1 -;CHECK: frintn.2s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frintn_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: frintn.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %A) ret <2 x float> %tmp3 } define <4 x float> @frintn_4s(<4 x float> 
%A) nounwind { -;CHECK-LABEL: frintn_4s: -;CHECK-NOT: ld1 -;CHECK: frintn.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frintn_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: frintn.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %A) ret <4 x float> %tmp3 } define <2 x double> @frintn_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: frintn_2d: -;CHECK-NOT: ld1 -;CHECK: frintn.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frintn_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: frintn.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %A) ret <2 x double> %tmp3 } @@ -616,32 +633,29 @@ declare <2 x float> @llvm.roundeven.v2f32(<2 x float>) nounwind readnone declare <4 x float> @llvm.roundeven.v4f32(<4 x float>) nounwind readnone declare <2 x double> @llvm.roundeven.v2f64(<2 x double>) nounwind readnone -; FALLBACK-NOT: remark{{.*}}frintp_2s define <2 x float> @frintp_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: frintp_2s: -;CHECK-NOT: ld1 -;CHECK: frintp.2s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frintp_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: frintp.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x float> @llvm.ceil.v2f32(<2 x float> %A) ret <2 x float> %tmp3 } -; FALLBACK-NOT: remark{{.*}}frintp_4s define <4 x float> @frintp_4s(<4 x float> %A) nounwind { -;CHECK-LABEL: frintp_4s: -;CHECK-NOT: ld1 -;CHECK: frintp.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frintp_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: frintp.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %A) ret <4 x float> %tmp3 } -; FALLBACK-NOT: remark{{.*}}frintp_2d define <2 x double> @frintp_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: frintp_2d: -;CHECK-NOT: ld1 -;CHECK: frintp.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frintp_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: frintp.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %A) ret <2 x double> %tmp3 } @@ -651,28 +665,28 @@ declare <4 x float> 
@llvm.ceil.v4f32(<4 x float>) nounwind readnone declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone define <2 x float> @frintx_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: frintx_2s: -;CHECK-NOT: ld1 -;CHECK: frintx.2s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frintx_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: frintx.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x float> @llvm.rint.v2f32(<2 x float> %A) ret <2 x float> %tmp3 } define <4 x float> @frintx_4s(<4 x float> %A) nounwind { -;CHECK-LABEL: frintx_4s: -;CHECK-NOT: ld1 -;CHECK: frintx.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frintx_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: frintx.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <4 x float> @llvm.rint.v4f32(<4 x float> %A) ret <4 x float> %tmp3 } define <2 x double> @frintx_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: frintx_2d: -;CHECK-NOT: ld1 -;CHECK: frintx.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frintx_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: frintx.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x double> @llvm.rint.v2f64(<2 x double> %A) ret <2 x double> %tmp3 } @@ -682,28 +696,28 @@ declare <4 x float> @llvm.rint.v4f32(<4 x float>) nounwind readnone declare <2 x double> @llvm.rint.v2f64(<2 x double>) nounwind readnone define <2 x float> @frintz_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: frintz_2s: -;CHECK-NOT: ld1 -;CHECK: frintz.2s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frintz_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: frintz.2s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x float> @llvm.trunc.v2f32(<2 x float> %A) ret <2 x float> %tmp3 } define <4 x float> @frintz_4s(<4 x float> %A) nounwind { -;CHECK-LABEL: frintz_4s: -;CHECK-NOT: ld1 -;CHECK: frintz.4s v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frintz_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: frintz.4s v0, v0 +; CHECK-NEXT: ret %tmp3 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %A) ret <4 x float> %tmp3 } define <2 x double> @frintz_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: frintz_2d: 
-;CHECK-NOT: ld1 -;CHECK: frintz.2d v0, v0 -;CHECK-NEXT: ret +; CHECK-LABEL: frintz_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: frintz.2d v0, v0 +; CHECK-NEXT: ret %tmp3 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %A) ret <2 x double> %tmp3 } @@ -713,19 +727,20 @@ declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone define <2 x float> @fcvtxn_2s(<2 x double> %A) nounwind { -;CHECK-LABEL: fcvtxn_2s: -;CHECK-NOT: ld1 -;CHECK: fcvtxn v0.2s, v0.2d -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtxn_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtxn v0.2s, v0.2d +; CHECK-NEXT: ret %tmp3 = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A) ret <2 x float> %tmp3 } define <4 x float> @fcvtxn_4s(<2 x float> %ret, <2 x double> %A) nounwind { -;CHECK-LABEL: fcvtxn_4s: -;CHECK-NOT: ld1 -;CHECK: fcvtxn2 v0.4s, v1.2d -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtxn_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fcvtxn2 v0.4s, v1.2d +; CHECK-NEXT: ret %tmp3 = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A) %res = shufflevector <2 x float> %ret, <2 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ret <4 x float> %res @@ -734,28 +749,28 @@ define <4 x float> @fcvtxn_4s(<2 x float> %ret, <2 x double> %A) nounwind { declare <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone define <2 x i32> @fcvtzsc_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: fcvtzsc_2s: -;CHECK-NOT: ld1 -;CHECK: fcvtzs.2s v0, v0, #1 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzsc_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs.2s v0, v0, #1 +; CHECK-NEXT: ret %tmp3 = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %A, i32 1) ret <2 x i32> %tmp3 } define <4 x i32> @fcvtzsc_4s(<4 x float> %A) nounwind { -;CHECK-LABEL: fcvtzsc_4s: -;CHECK-NOT: ld1 -;CHECK: fcvtzs.4s v0, v0, #1 -;CHECK-NEXT: ret +; CHECK-LABEL: 
fcvtzsc_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs.4s v0, v0, #1 +; CHECK-NEXT: ret %tmp3 = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %A, i32 1) ret <4 x i32> %tmp3 } define <2 x i64> @fcvtzsc_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: fcvtzsc_2d: -;CHECK-NOT: ld1 -;CHECK: fcvtzs.2d v0, v0, #1 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzsc_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs.2d v0, v0, #1 +; CHECK-NEXT: ret %tmp3 = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> %A, i32 1) ret <2 x i64> %tmp3 } @@ -765,28 +780,28 @@ declare <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) no declare <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32) nounwind readnone define <2 x i32> @fcvtzuc_2s(<2 x float> %A) nounwind { -;CHECK-LABEL: fcvtzuc_2s: -;CHECK-NOT: ld1 -;CHECK: fcvtzu.2s v0, v0, #1 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzuc_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu.2s v0, v0, #1 +; CHECK-NEXT: ret %tmp3 = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %A, i32 1) ret <2 x i32> %tmp3 } define <4 x i32> @fcvtzuc_4s(<4 x float> %A) nounwind { -;CHECK-LABEL: fcvtzuc_4s: -;CHECK-NOT: ld1 -;CHECK: fcvtzu.4s v0, v0, #1 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzuc_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu.4s v0, v0, #1 +; CHECK-NEXT: ret %tmp3 = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %A, i32 1) ret <4 x i32> %tmp3 } define <2 x i64> @fcvtzuc_2d(<2 x double> %A) nounwind { -;CHECK-LABEL: fcvtzuc_2d: -;CHECK-NOT: ld1 -;CHECK: fcvtzu.2d v0, v0, #1 -;CHECK-NEXT: ret +; CHECK-LABEL: fcvtzuc_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu.2d v0, v0, #1 +; CHECK-NEXT: ret %tmp3 = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> %A, i32 1) ret <2 x i64> %tmp3 } @@ -796,28 +811,28 @@ declare <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) no declare <2 x i64> 
@llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32) nounwind readnone define <2 x float> @scvtf_2sc(<2 x i32> %A) nounwind { -;CHECK-LABEL: scvtf_2sc: -;CHECK-NOT: ld1 -;CHECK: scvtf.2s v0, v0, #1 -;CHECK-NEXT: ret +; CHECK-LABEL: scvtf_2sc: +; CHECK: // %bb.0: +; CHECK-NEXT: scvtf.2s v0, v0, #1 +; CHECK-NEXT: ret %tmp3 = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %A, i32 1) ret <2 x float> %tmp3 } define <4 x float> @scvtf_4sc(<4 x i32> %A) nounwind { -;CHECK-LABEL: scvtf_4sc: -;CHECK-NOT: ld1 -;CHECK: scvtf.4s v0, v0, #1 -;CHECK-NEXT: ret +; CHECK-LABEL: scvtf_4sc: +; CHECK: // %bb.0: +; CHECK-NEXT: scvtf.4s v0, v0, #1 +; CHECK-NEXT: ret %tmp3 = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %A, i32 1) ret <4 x float> %tmp3 } define <2 x double> @scvtf_2dc(<2 x i64> %A) nounwind { -;CHECK-LABEL: scvtf_2dc: -;CHECK-NOT: ld1 -;CHECK: scvtf.2d v0, v0, #1 -;CHECK-NEXT: ret +; CHECK-LABEL: scvtf_2dc: +; CHECK: // %bb.0: +; CHECK-NEXT: scvtf.2d v0, v0, #1 +; CHECK-NEXT: ret %tmp3 = call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %A, i32 1) ret <2 x double> %tmp3 } @@ -827,46 +842,77 @@ declare <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) no declare <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone define <2 x float> @ucvtf_2sc(<2 x i32> %A) nounwind { -;CHECK-LABEL: ucvtf_2sc: -;CHECK-NOT: ld1 -;CHECK: ucvtf.2s v0, v0, #1 -;CHECK-NEXT: ret +; CHECK-LABEL: ucvtf_2sc: +; CHECK: // %bb.0: +; CHECK-NEXT: ucvtf.2s v0, v0, #1 +; CHECK-NEXT: ret %tmp3 = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %A, i32 1) ret <2 x float> %tmp3 } define <4 x float> @ucvtf_4sc(<4 x i32> %A) nounwind { -;CHECK-LABEL: ucvtf_4sc: -;CHECK-NOT: ld1 -;CHECK: ucvtf.4s v0, v0, #1 -;CHECK-NEXT: ret +; CHECK-LABEL: ucvtf_4sc: +; CHECK: // %bb.0: +; CHECK-NEXT: ucvtf.4s v0, v0, #1 +; CHECK-NEXT: ret %tmp3 = call <4 x float> 
@llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %A, i32 1) ret <4 x float> %tmp3 } define <2 x double> @ucvtf_2dc(<2 x i64> %A) nounwind { -;CHECK-LABEL: ucvtf_2dc: -;CHECK-NOT: ld1 -;CHECK: ucvtf.2d v0, v0, #1 -;CHECK-NEXT: ret +; CHECK-LABEL: ucvtf_2dc: +; CHECK: // %bb.0: +; CHECK-NEXT: ucvtf.2d v0, v0, #1 +; CHECK-NEXT: ret %tmp3 = call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %A, i32 1) ret <2 x double> %tmp3 } - -;CHECK-LABEL: autogen_SD28458: -;CHECK: fcvt -;CHECK: ret define void @autogen_SD28458(<8 x double> %val.f64, ptr %addr.f32) { +; CHECK-SD-LABEL: autogen_SD28458: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fcvtn v2.2s, v2.2d +; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d +; CHECK-SD-NEXT: fcvtn2 v2.4s, v3.2d +; CHECK-SD-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-SD-NEXT: stp q0, q2, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: autogen_SD28458: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d +; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d +; CHECK-GI-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-GI-NEXT: fcvtn2 v2.4s, v3.2d +; CHECK-GI-NEXT: stp q0, q2, [x0] +; CHECK-GI-NEXT: ret %Tr53 = fptrunc <8 x double> %val.f64 to <8 x float> store <8 x float> %Tr53, ptr %addr.f32 ret void } -;CHECK-LABEL: autogen_SD19225: -;CHECK: fcvt -;CHECK: ret define void @autogen_SD19225(ptr %addr.f64, ptr %addr.f32) { +; CHECK-SD-LABEL: autogen_SD19225: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldp q1, q0, [x1] +; CHECK-SD-NEXT: fcvtl2 v2.2d, v0.4s +; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s +; CHECK-SD-NEXT: fcvtl2 v3.2d, v1.4s +; CHECK-SD-NEXT: fcvtl v1.2d, v1.2s +; CHECK-SD-NEXT: stp q0, q2, [x0, #32] +; CHECK-SD-NEXT: stp q1, q3, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: autogen_SD19225: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldp q0, q1, [x1] +; CHECK-GI-NEXT: fcvtl v2.2d, v0.2s +; CHECK-GI-NEXT: fcvtl2 v0.2d, v0.4s +; CHECK-GI-NEXT: fcvtl v3.2d, v1.2s +; CHECK-GI-NEXT: fcvtl2 v1.2d, v1.4s +; CHECK-GI-NEXT: stp q2, q0, [x0] +; CHECK-GI-NEXT: stp q3, q1, [x0, #32] 
+; CHECK-GI-NEXT: ret %A = load <8 x float>, ptr %addr.f32 %Tr53 = fpext <8 x float> %A to <8 x double> store <8 x double> %Tr53, ptr %addr.f64 diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll b/llvm/test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll index 845b8cb9a1fe..5dd882106883 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll @@ -1,11 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=arm64-eabi | FileCheck %s +; RUN: llc < %s -mtriple=arm64-eabi -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for fcvtxn define float @fcvtxn(double %a) { ; CHECK-LABEL: fcvtxn: -; CHECK: fcvtxn s0, d0 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtxn s0, d0 +; CHECK-NEXT: ret %vcvtxd.i = tail call float @llvm.aarch64.sisd.fcvtxn(double %a) nounwind ret float %vcvtxd.i } - -declare float @llvm.aarch64.sisd.fcvtxn(double) nounwind readnone diff --git a/llvm/test/CodeGen/AArch64/arm64-vshift.ll b/llvm/test/CodeGen/AArch64/arm64-vshift.ll index a7f9ca8d73c1..8ec5434085d6 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vshift.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vshift.ll @@ -101,8 +101,6 @@ ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli4s ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sli2d ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu_zero_shift_amount -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_trunc_v2i64_v2i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_trunc_v2i64_v2i8 define <8 x i8> @sqshl8b(ptr %A, ptr %B) nounwind { ; CHECK-LABEL: sqshl8b: @@ -4378,25 +4376,87 @@ define <8 x i16> @signbits_vashr(<8 x i16> %a) { } define <2 x i8> @lshr_trunc_v2i64_v2i8(<2 
x i64> %a) { -; CHECK-LABEL: lshr_trunc_v2i64_v2i8: -; CHECK: // %bb.0: -; CHECK-NEXT: shrn v0.2s, v0.2d, #16 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: lshr_trunc_v2i64_v2i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: shrn v0.2s, v0.2d, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: lshr_trunc_v2i64_v2i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: xtn v0.2s, v0.2d +; CHECK-GI-NEXT: ushr v0.2s, v0.2s, #16 +; CHECK-GI-NEXT: ret %b = lshr <2 x i64> %a, <i64 16, i64 16> %c = trunc <2 x i64> %b to <2 x i8> ret <2 x i8> %c } +define <4 x i16> @lshr_trunc_v4i64_v4i16(<4 x i64> %a) { +; CHECK-SD-LABEL: lshr_trunc_v4i64_v4i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: xtn v1.2s, v1.2d +; CHECK-SD-NEXT: xtn v0.2s, v0.2d +; CHECK-SD-NEXT: ushr v1.2s, v1.2s, #8 +; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #8 +; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: lshr_trunc_v4i64_v4i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI270_0 +; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI270_0] +; CHECK-GI-NEXT: uzp1 v2.4s, v2.4s, v2.4s +; CHECK-GI-NEXT: neg v1.4s, v2.4s +; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret + %b = lshr <4 x i64> %a, <i64 8, i64 8, i64 8, i64 8> + %c = trunc <4 x i64> %b to <4 x i16> + ret <4 x i16> %c +} + define <2 x i8> @ashr_trunc_v2i64_v2i8(<2 x i64> %a) { -; CHECK-LABEL: ashr_trunc_v2i64_v2i8: -; CHECK: // %bb.0: -; CHECK-NEXT: shrn v0.2s, v0.2d, #16 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ashr_trunc_v2i64_v2i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: shrn v0.2s, v0.2d, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ashr_trunc_v2i64_v2i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: xtn v0.2s, v0.2d +; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-GI-NEXT: ret %b = ashr <2 x i64> %a, <i64 16, i64 16> %c = trunc <2 x i64> %b to <2 x i8> ret <2 x i8> %c } +define <4 x i16> @ashr_trunc_v4i64_v4i16(<4 x i64> %a) { +; CHECK-SD-LABEL: 
ashr_trunc_v4i64_v4i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: xtn v1.2s, v1.2d +; CHECK-SD-NEXT: xtn v0.2s, v0.2d +; CHECK-SD-NEXT: ushr v1.2s, v1.2s, #8 +; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #8 +; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ashr_trunc_v4i64_v4i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI272_0 +; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI272_0] +; CHECK-GI-NEXT: uzp1 v2.4s, v2.4s, v2.4s +; CHECK-GI-NEXT: neg v1.4s, v2.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret + %b = ashr <4 x i64> %a, <i64 8, i64 8, i64 8, i64 8> + %c = trunc <4 x i64> %b to <4 x i16> + ret <4 x i16> %c +} + define <2 x i8> @shl_trunc_v2i64_v2i8(<2 x i64> %a) { ; CHECK-SD-LABEL: shl_trunc_v2i64_v2i8: ; CHECK-SD: // %bb.0: @@ -4414,4 +4474,27 @@ define <2 x i8> @shl_trunc_v2i64_v2i8(<2 x i64> %a) { ret <2 x i8> %c } +define <4 x i16> @shl_trunc_v4i64_v4i16(<4 x i64> %a) { +; CHECK-SD-LABEL: shl_trunc_v4i64_v4i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shl_trunc_v4i64_v4i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI274_0 +; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI274_0] +; CHECK-GI-NEXT: uzp1 v1.4s, v2.4s, v2.4s +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: xtn v1.4h, v1.4s +; CHECK-GI-NEXT: ushl v0.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: ret + %b = shl <4 x i64> %a, <i64 8, i64 8, i64 8, i64 8> + %c = trunc <4 x i64> %b to <4 x i16> + ret <4 x i16> %c +} + declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll index fa15ab42c263..a0f1b719372b 100644 --- 
a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll +++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll @@ -1,33 +1,84 @@ -; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s -check-prefixes=NOTCPU-LINUX --match-full-lines -; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=generic | FileCheck %s -check-prefixes=NOTCPU-APPLE --match-full-lines -; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-fpr64 | FileCheck %s -check-prefixes=ATTR --match-full-lines +; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s -check-prefixes=NOZCM-FPR128-CPU --match-full-lines +; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=generic | FileCheck %s -check-prefixes=NOZCM-FPR128-CPU --match-full-lines +; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ZCM-FPR128-CPU --match-full-lines +; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-fpr128 | FileCheck %s -check-prefixes=NOZCM-FPR128-ATTR --match-full-lines +; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-fpr128 | FileCheck %s -check-prefixes=ZCM-FPR128-ATTR --match-full-lines + +define void @zero_cycle_regmov_FPR64(double %a, double %b, double %c, double %d) { +entry: +; CHECK-LABEL: t: +; NOZCM-FPR128-CPU: fmov d0, d2 +; NOZCM-FPR128-CPU: fmov d1, d3 +; NOZCM-FPR128-CPU: fmov [[REG2:d[0-9]+]], d3 +; NOZCM-FPR128-CPU: fmov [[REG1:d[0-9]+]], d2 +; NOZCM-FPR128-CPU-NEXT: bl {{_?foo_double}} +; NOZCM-FPR128-CPU: fmov d0, [[REG1]] +; NOZCM-FPR128-CPU: fmov d1, [[REG2]] + +; ZCM-FPR128-CPU: mov.16b [[REG2:v[0-9]+]], v3 +; ZCM-FPR128-CPU: mov.16b [[REG1:v[0-9]+]], v2 +; ZCM-FPR128-CPU: mov.16b v0, v2 +; ZCM-FPR128-CPU: mov.16b v1, v3 +; ZCM-FPR128-CPU-NEXT: bl {{_?foo_double}} +; ZCM-FPR128-CPU: mov.16b v0, [[REG1]] +; ZCM-FPR128-CPU: mov.16b v1, [[REG2]] + +; NOZCM-FPR128-ATTR: fmov [[REG2:d[0-9]+]], d3 +; NOZCM-FPR128-ATTR: fmov [[REG1:d[0-9]+]], d2 +; NOZCM-FPR128-ATTR: fmov d0, d2 +; NOZCM-FPR128-ATTR: fmov d1, d3 +; 
NOZCM-FPR128-ATTR-NEXT: bl {{_?foo_double}} +; NOZCM-FPR128-ATTR: fmov d0, [[REG1]] +; NOZCM-FPR128-ATTR: fmov d1, [[REG2]] + +; ZCM-FPR128-ATTR: mov.16b v0, v2 +; ZCM-FPR128-ATTR: mov.16b v1, v3 +; ZCM-FPR128-ATTR: mov.16b [[REG2:v[0-9]+]], v3 +; ZCM-FPR128-ATTR: mov.16b [[REG1:v[0-9]+]], v2 +; ZCM-FPR128-ATTR-NEXT: bl {{_?foo_double}} +; ZCM-FPR128-ATTR: mov.16b v0, [[REG1]] +; ZCM-FPR128-ATTR: mov.16b v1, [[REG2]] + %call = call double @foo_double(double %c, double %d) + %call1 = call double @foo_double(double %c, double %d) + unreachable +} + +declare float @foo_double(double, double) define void @zero_cycle_regmov_FPR32(float %a, float %b, float %c, float %d) { entry: ; CHECK-LABEL: t: -; NOTCPU-LINUX: fmov s0, s2 -; NOTCPU-LINUX: fmov s1, s3 -; NOTCPU-LINUX: fmov [[REG2:s[0-9]+]], s3 -; NOTCPU-LINUX: fmov [[REG1:s[0-9]+]], s2 -; NOTCPU-LINUX-NEXT: bl {{_?foo_float}} -; NOTCPU-LINUX: fmov s0, [[REG1]] -; NOTCPU-LINUX: fmov s1, [[REG2]] +; NOZCM-FPR128-CPU: fmov s0, s2 +; NOZCM-FPR128-CPU: fmov s1, s3 +; NOZCM-FPR128-CPU: fmov [[REG2:s[0-9]+]], s3 +; NOZCM-FPR128-CPU: fmov [[REG1:s[0-9]+]], s2 +; NOZCM-FPR128-CPU-NEXT: bl {{_?foo_float}} +; NOZCM-FPR128-CPU: fmov s0, [[REG1]] +; NOZCM-FPR128-CPU: fmov s1, [[REG2]] -; NOTCPU-APPLE: fmov s0, s2 -; NOTCPU-APPLE: fmov s1, s3 -; NOTCPU-APPLE: fmov [[REG2:s[0-9]+]], s3 -; NOTCPU-APPLE: fmov [[REG1:s[0-9]+]], s2 -; NOTCPU-APPLE-NEXT: bl {{_?foo_float}} -; NOTCPU-APPLE: fmov s0, [[REG1]] -; NOTCPU-APPLE: fmov s1, [[REG2]] +; ZCM-FPR128-CPU: mov.16b [[REG2:v[0-9]+]], v3 +; ZCM-FPR128-CPU: mov.16b [[REG1:v[0-9]+]], v2 +; ZCM-FPR128-CPU: mov.16b v0, v2 +; ZCM-FPR128-CPU: mov.16b v1, v3 +; ZCM-FPR128-CPU-NEXT: bl {{_?foo_float}} +; ZCM-FPR128-CPU: mov.16b v0, [[REG1]] +; ZCM-FPR128-CPU: mov.16b v1, [[REG2]] -; ATTR: fmov d0, d2 -; ATTR: fmov d1, d3 -; ATTR: fmov [[REG2:d[0-9]+]], d3 -; ATTR: fmov [[REG1:d[0-9]+]], d2 -; ATTR-NEXT: bl {{_?foo_float}} -; ATTR: fmov d0, [[REG1]] -; ATTR: fmov d1, [[REG2]] +; 
NOZCM-FPR128-ATTR: fmov [[REG2:s[0-9]+]], s3 +; NOZCM-FPR128-ATTR: fmov [[REG1:s[0-9]+]], s2 +; NOZCM-FPR128-ATTR: fmov s0, s2 +; NOZCM-FPR128-ATTR: fmov s1, s3 +; NOZCM-FPR128-ATTR-NEXT: bl {{_?foo_float}} +; NOZCM-FPR128-ATTR: fmov s0, [[REG1]] +; NOZCM-FPR128-ATTR: fmov s1, [[REG2]] + +; ZCM-FPR128-ATTR: mov.16b v0, v2 +; ZCM-FPR128-ATTR: mov.16b v1, v3 +; ZCM-FPR128-ATTR: mov.16b [[REG2:v[0-9]+]], v3 +; ZCM-FPR128-ATTR: mov.16b [[REG1:v[0-9]+]], v2 +; ZCM-FPR128-ATTR-NEXT: bl {{_?foo_float}} +; ZCM-FPR128-ATTR: mov.16b v0, [[REG1]] +; ZCM-FPR128-ATTR: mov.16b v1, [[REG2]] %call = call float @foo_float(float %c, float %d) %call1 = call float @foo_float(float %c, float %d) unreachable @@ -38,29 +89,37 @@ declare float @foo_float(float, float) define void @zero_cycle_regmov_FPR16(half %a, half %b, half %c, half %d) { entry: ; CHECK-LABEL: t: -; NOTCPU-LINUX: fmov s0, s2 -; NOTCPU-LINUX: fmov s1, s3 -; NOTCPU-LINUX: fmov [[REG2:s[0-9]+]], s3 -; NOTCPU-LINUX: fmov [[REG1:s[0-9]+]], s2 -; NOTCPU-LINUX-NEXT: bl {{_?foo_half}} -; NOTCPU-LINUX: fmov s0, [[REG1]] -; NOTCPU-LINUX: fmov s1, [[REG2]] +; NOZCM-FPR128-CPU: fmov s0, s2 +; NOZCM-FPR128-CPU: fmov s1, s3 +; NOZCM-FPR128-CPU: fmov [[REG2:s[0-9]+]], s3 +; NOZCM-FPR128-CPU: fmov [[REG1:s[0-9]+]], s2 +; NOZCM-FPR128-CPU-NEXT: bl {{_?foo_half}} +; NOZCM-FPR128-CPU: fmov s0, [[REG1]] +; NOZCM-FPR128-CPU: fmov s1, [[REG2]] + +; ZCM-FPR128-CPU: mov.16b [[REG2:v[0-9]+]], v3 +; ZCM-FPR128-CPU: mov.16b [[REG1:v[0-9]+]], v2 +; ZCM-FPR128-CPU: mov.16b v0, v2 +; ZCM-FPR128-CPU: mov.16b v1, v3 +; ZCM-FPR128-CPU-NEXT: bl {{_?foo_half}} +; ZCM-FPR128-CPU: mov.16b v0, [[REG1]] +; ZCM-FPR128-CPU: mov.16b v1, [[REG2]] -; NOTCPU-APPLE: fmov s0, s2 -; NOTCPU-APPLE: fmov s1, s3 -; NOTCPU-APPLE: fmov [[REG2:s[0-9]+]], s3 -; NOTCPU-APPLE: fmov [[REG1:s[0-9]+]], s2 -; NOTCPU-APPLE-NEXT: bl {{_?foo_half}} -; NOTCPU-APPLE: fmov s0, [[REG1]] -; NOTCPU-APPLE: fmov s1, [[REG2]] +; NOZCM-FPR128-ATTR: fmov [[REG2:s[0-9]+]], s3 +; 
NOZCM-FPR128-ATTR: fmov [[REG1:s[0-9]+]], s2 +; NOZCM-FPR128-ATTR: fmov s0, s2 +; NOZCM-FPR128-ATTR: fmov s1, s3 +; NOZCM-FPR128-ATTR-NEXT: bl {{_?foo_half}} +; NOZCM-FPR128-ATTR: fmov s0, [[REG1]] +; NOZCM-FPR128-ATTR: fmov s1, [[REG2]] -; ATTR: fmov d0, d2 -; ATTR: fmov d1, d3 -; ATTR: fmov [[REG2:d[0-9]+]], d3 -; ATTR: fmov [[REG1:d[0-9]+]], d2 -; ATTR-NEXT: bl {{_?foo_half}} -; ATTR: fmov d0, [[REG1]] -; ATTR: fmov d1, [[REG2]] +; ZCM-FPR128-ATTR: mov.16b v0, v2 +; ZCM-FPR128-ATTR: mov.16b v1, v3 +; ZCM-FPR128-ATTR: mov.16b [[REG2:v[0-9]+]], v3 +; ZCM-FPR128-ATTR: mov.16b [[REG1:v[0-9]+]], v2 +; ZCM-FPR128-ATTR-NEXT: bl {{_?foo_half}} +; ZCM-FPR128-ATTR: mov.16b v0, [[REG1]] +; ZCM-FPR128-ATTR: mov.16b v1, [[REG2]] %call = call half @foo_half(half %c, half %d) %call1 = call half @foo_half(half %c, half %d) unreachable diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll new file mode 100644 index 000000000000..ccdaa8779e38 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll @@ -0,0 +1,167 @@ +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-NOZCZ-FPR128 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+fullfp16 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-NOZCZ-FPR128-FULLFP16 +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+zcz-fpr128 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-ZCZ-FPR128 +; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,FP-WORKAROUND +; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-ZCZ-FPR128 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | 
FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 + +define half @tf16() { +entry: +; ALL-LABEL: tf16: +; FP-WORKAROUND: mov s0, wzr +; NOZCZ-FPR64-NOZCZ-FPR128: mov s0, wzr +; NOZCZ-FPR64-NOZCZ-FPR128-FULLFP16: mov h0, wzr +; ZCZ-FPR64: movi d0, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret half 0.0 +} + +define float @tf32() { +entry: +; ALL-LABEL: tf32: +; FP-WORKAROUND: mov s0, wzr +; NOZCZ-FPR64-NOZCZ-FPR128: mov s0, wzr +; ZCZ-FPR64: movi d0, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret float 0.0 +} + +define double @td64() { +entry: +; ALL-LABEL: td64: +; FP-WORKAROUND: mov d0, xzr +; NOZCZ-FPR64-NOZCZ-FPR128: mov d0, xzr +; ZCZ-FPR64: movi d0, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret double 0.0 +} + +define <8 x i8> @tv8i8() { +entry: +; ALL-LABEL: tv8i8: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0> +} + +define <4 x i16> @tv4i16() { +entry: +; ALL-LABEL: tv4i16: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <4 x i16> <i16 0, i16 0, i16 0, i16 0> +} + +define <2 x i32> @tv2i32() { +entry: +; ALL-LABEL: tv2i32: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <2 x i32> <i32 0, i32 0> +} + +define <2 x float> @tv2f32() 
{ +entry: +; ALL-LABEL: tv2f32: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <2 x float> <float 0.0, float 0.0> +} + +define <16 x i8> @tv16i8() { +entry: +; ALL-LABEL: tv16i8: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0> +} + +define <8 x i16> @tv8i16() { +entry: +; ALL-LABEL: tv8i16: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0> +} + +define <4 x i32> @tv4i32() { +entry: +; ALL-LABEL: tv4i32: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <4 x i32> <i32 0, i32 0, i32 0, i32 0> +} + +define <2 x i64> @tv2i64() { +entry: +; ALL-LABEL: tv2i64: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <2 x i64> <i64 0, i64 0> +} + +define <4 x float> @tv4f32() { +entry: +; ALL-LABEL: tv4f32: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <4 x float> <float 0.0, 
float 0.0, float 0.0, float 0.0> +} + +define <2 x double> @tv2d64() { +entry: +; ALL-LABEL: tv2d64: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64-NOZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; NOZCZ-FPR64-ZCZ-FPR128: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <2 x double> <double 0.0, double 0.0> +} + +; We used to produce spills+reloads for a Q register with zero cycle zeroing +; enabled. +; ALL-LABEL: foo: +; ALL-NOT: str q{{[0-9]+}} +; ALL-NOT: ldr q{{[0-9]+}} +define double @foo(i32 %n) { +entry: + br label %for.body + +for.body: + %phi0 = phi double [ 1.0, %entry ], [ %v0, %for.body ] + %i.076 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %conv21 = sitofp i32 %i.076 to double + %call = tail call fast double @sin(double %conv21) + %cmp.i = fcmp fast olt double %phi0, %call + %v0 = select i1 %cmp.i, double %call, double %phi0 + %inc = add nuw nsw i32 %i.076, 1 + %cmp = icmp slt i32 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret double %v0 +} + +declare double @sin(double) diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll new file mode 100644 index 000000000000..dc643062d869 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll @@ -0,0 +1,41 @@ +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-gpr32 | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-gpr64 | FileCheck %s -check-prefixes=ALL,ZCZ-GPR64 +; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=generic | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR +; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64 +; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64 +; 
RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64 + +define i8 @ti8() { +entry: +; ALL-LABEL: ti8: +; NOZCZ-GPR: mov w0, wzr +; ZCZ-GPR32: mov w0, #0 + ret i8 0 +} + +define i16 @ti16() { +entry: +; ALL-LABEL: ti16: +; NOZCZ-GPR: mov w0, wzr +; ZCZ-GPR32: mov w0, #0 + ret i16 0 +} + +define i32 @ti32() { +entry: +; ALL-LABEL: ti32: +; NOZCZ-GPR: mov w0, wzr +; ZCZ-GPR32: mov w0, #0 + ret i32 0 +} + +define i64 @ti64() { +entry: +; ALL-LABEL: ti64: +; NOZCZ-GPR: mov x0, xzr +; ZCZ-GPR64: mov x0, #0 + ret i64 0 +} diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll deleted file mode 100644 index 6c3cd4766d79..000000000000 --- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll +++ /dev/null @@ -1,231 +0,0 @@ -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=-zcz-gp,+no-zcz-fp | FileCheck %s -check-prefixes=ALL,NONEGP,NONEFP -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZEROGP,ZERO16 -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-gp,+no-zcz-fp | FileCheck %s -check-prefixes=ALL,ZEROGP,NONEFP -; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,NONEGP,ZEROFP -; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,ZEROGP,NONEFP -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=apple-a10 | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP -; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZEROGP,NONE16 -; RUN: llc < %s 
-mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,NONEGP,ZEROFP -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP - -declare void @bar(half, float, double, <2 x double>) -declare void @bari(i32, i32) -declare void @barl(i64, i64) -declare void @barf(float, float) - -define void @t1() nounwind ssp { -entry: -; ALL-LABEL: t1: -; ALL-NOT: fmov -; NONEFP-DAG: fmov s0, wzr -; NONEFP-DAG: fmov s1, wzr -; NONEFP-DAG: fmov d2, xzr -; NONEFP-DAG: movi{{(.16b)?}} v3{{(.2d)?}}, #0 -; NONE16: fmov h0, wzr -; NONE16: fmov s1, wzr -; NONE16: fmov d2, xzr -; NONE16: movi{{(.16b)?}} v3{{(.2d)?}}, #0 -; ZEROFP-DAG: movi d0, #0 -; ZEROFP-DAG: movi d1, #0 -; ZEROFP-DAG: movi d2, #0 -; ZEROFP-DAG: movi v3.2d, #0 -; ZERO16: movi d0, #0 -; ZERO16: movi d1, #0 -; ZERO16: movi d2, #0 -; ZERO16: movi v3.2d, #0 - tail call void @bar(half 0.000000e+00, float 0.000000e+00, double 0.000000e+00, <2 x double> <double 0.000000e+00, double 0.000000e+00>) nounwind - ret void -} - -define void @t2() nounwind ssp { -entry: -; ALL-LABEL: t2: -; NONEGP: mov w0, wzr -; NONEGP: mov w1, wzr -; ZEROGP: mov w0, #0 -; ZEROGP: mov w1, #0 - tail call void @bari(i32 0, i32 0) nounwind - ret void -} - -define void @t3() nounwind ssp { -entry: -; ALL-LABEL: t3: -; NONEGP: mov x0, xzr -; NONEGP: mov x1, xzr -; ZEROGP: mov x0, #0 -; ZEROGP: mov x1, #0 - tail call void @barl(i64 0, i64 0) nounwind - ret void -} - -define void @t4() nounwind ssp { -; ALL-LABEL: t4: -; NONEFP: fmov s{{[0-3]+}}, wzr -; NONEFP: fmov s{{[0-3]+}}, wzr -; ZEROFP: movi d0, #0 -; ZEROFP: movi d1, #0 - tail call void @barf(float 0.000000e+00, float 0.000000e+00) nounwind - ret void -} - -declare double @sin(double) - -; We used to produce spills+reloads for a Q register with zero cycle zeroing -; enabled. 
-; ALL-LABEL: foo: -; ALL-NOT: str q{{[0-9]+}} -; ALL-NOT: ldr q{{[0-9]+}} -define double @foo(i32 %n) { -entry: - br label %for.body - -for.body: - %phi0 = phi double [ 1.0, %entry ], [ %v0, %for.body ] - %i.076 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %conv21 = sitofp i32 %i.076 to double - %call = tail call fast double @sin(double %conv21) - %cmp.i = fcmp fast olt double %phi0, %call - %v0 = select i1 %cmp.i, double %call, double %phi0 - %inc = add nuw nsw i32 %i.076, 1 - %cmp = icmp slt i32 %inc, %n - br i1 %cmp, label %for.body, label %for.end - -for.end: - ret double %v0 -} - -define <2 x i64> @t6() { -; ALL-LABEL: t6: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <2 x i64> zeroinitializer -} - -define i1 @ti1() { -entry: -; ALL-LABEL: ti1: -; NONEGP: mov w0, wzr -; ZEROGP: mov w0, #0 - ret i1 false -} - -define i8 @ti8() { -entry: -; ALL-LABEL: ti8: -; NONEGP: mov w0, wzr -; ZEROGP: mov w0, #0 - ret i8 0 -} - -define i16 @ti16() { -entry: -; ALL-LABEL: ti16: -; NONEGP: mov w0, wzr - ; ZEROGP: mov w0, #0 - ret i16 0 -} - -define i32 @ti32() { -entry: -; ALL-LABEL: ti32: -; NONEGP: mov w0, wzr -; ZEROGP: mov w0, #0 - ret i32 0 -} - -define i64 @ti64() { -entry: -; ALL-LABEL: ti64: -; NONEGP: mov x0, xzr -; ZEROGP: mov x0, #0 - ret i64 0 -} - -define float @tf32() { -entry: -; ALL-LABEL: tf32: -; NONEFP: mov s0, wzr -; ZEROFP: movi d0, #0 - ret float 0.0 -} - -define double @td64() { -entry: -; ALL-LABEL: td64: -; NONEFP: mov d0, xzr -; ZEROFP: movi d0, #0 - ret double 0.0 -} - -define <8 x i8> @tv8i8() { -entry: -; ALL-LABEL: tv8i8: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0> -} - -define <4 x i16> @tv4i16() { -entry: -; ALL-LABEL: tv4i16: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <4 x i16> <i16 0, i16 0, i16 0, i16 0> -} - -define <2 x i32> @tv2i32() { -entry: -; ALL-LABEL: tv2i32: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <2 x i32> <i32 0, i32 0> -} - -define <2 x float> @tv2f32() 
{ -entry: -; ALL-LABEL: tv2f32: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <2 x float> <float 0.0, float 0.0> -} - -define <16 x i8> @tv16i8() { -entry: -; ALL-LABEL: tv16i8: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0> -} - -define <8 x i16> @tv8i16() { -entry: -; ALL-LABEL: tv8i16: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0> -} - -define <4 x i32> @tv4i32() { -entry: -; ALL-LABEL: tv4i32: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <4 x i32> <i32 0, i32 0, i32 0, i32 0> -} - -define <2 x i64> @tv2i64() { -entry: -; ALL-LABEL: tv2i64: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <2 x i64> <i64 0, i64 0> -} - -define <4 x float> @tv4f32() { -entry: -; ALL-LABEL: tv4f32: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0> -} - -define <2 x double> @tv2d64() { -entry: -; ALL-LABEL: tv2d64: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <2 x double> <double 0.0, double 0.0> -} - diff --git a/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll b/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll index 42cb3d4e9589..bf78429da52f 100644 --- a/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll +++ b/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll @@ -850,18 +850,18 @@ define dso_local void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind { ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldxr x8, [x9] ; CHECK-NEXT: cmp x8, x0 -; CHECK-NEXT: b.ne .LBB43_3 +; CHECK-NEXT: b.ne .LBB43_4 ; CHECK-NEXT: // %bb.2: // %cmpxchg.trystore ; CHECK-NEXT: // in Loop: Header=BB43_1 Depth=1 ; CHECK-NEXT: stxr w10, x1, [x9] ; CHECK-NEXT: cbnz w10, .LBB43_1 -; CHECK-NEXT: b .LBB43_4 -; CHECK-NEXT: .LBB43_3: // %cmpxchg.nostore -; CHECK-NEXT: clrex -; CHECK-NEXT: .LBB43_4: // %cmpxchg.end +; CHECK-NEXT: .LBB43_3: // %cmpxchg.end ; CHECK-NEXT: adrp x9, 
var64 ; CHECK-NEXT: str x8, [x9, :lo12:var64] ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB43_4: // %cmpxchg.nostore +; CHECK-NEXT: clrex +; CHECK-NEXT: b .LBB43_3 %pair = cmpxchg ptr @var64, i64 %wanted, i64 %new monotonic monotonic %old = extractvalue { i64, i1 } %pair, 0 store i64 %old, ptr @var64 diff --git a/llvm/test/CodeGen/AArch64/atomic-ops.ll b/llvm/test/CodeGen/AArch64/atomic-ops.ll index d8ac89f76b32..deeba7ef3ce2 100644 --- a/llvm/test/CodeGen/AArch64/atomic-ops.ll +++ b/llvm/test/CodeGen/AArch64/atomic-ops.ll @@ -1090,18 +1090,18 @@ define dso_local void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind { ; INLINE_ATOMICS-NEXT: // =>This Inner Loop Header: Depth=1 ; INLINE_ATOMICS-NEXT: ldxr x8, [x9] ; INLINE_ATOMICS-NEXT: cmp x8, x0 -; INLINE_ATOMICS-NEXT: b.ne .LBB43_3 +; INLINE_ATOMICS-NEXT: b.ne .LBB43_4 ; INLINE_ATOMICS-NEXT: // %bb.2: // %cmpxchg.trystore ; INLINE_ATOMICS-NEXT: // in Loop: Header=BB43_1 Depth=1 ; INLINE_ATOMICS-NEXT: stxr w10, x1, [x9] ; INLINE_ATOMICS-NEXT: cbnz w10, .LBB43_1 -; INLINE_ATOMICS-NEXT: b .LBB43_4 -; INLINE_ATOMICS-NEXT: .LBB43_3: // %cmpxchg.nostore -; INLINE_ATOMICS-NEXT: clrex -; INLINE_ATOMICS-NEXT: .LBB43_4: // %cmpxchg.end +; INLINE_ATOMICS-NEXT: .LBB43_3: // %cmpxchg.end ; INLINE_ATOMICS-NEXT: adrp x9, var64 ; INLINE_ATOMICS-NEXT: str x8, [x9, :lo12:var64] ; INLINE_ATOMICS-NEXT: ret +; INLINE_ATOMICS-NEXT: .LBB43_4: // %cmpxchg.nostore +; INLINE_ATOMICS-NEXT: clrex +; INLINE_ATOMICS-NEXT: b .LBB43_3 ; ; OUTLINE_ATOMICS-LABEL: test_atomic_cmpxchg_i64: ; OUTLINE_ATOMICS: // %bb.0: diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll index 21729b9dfd10..24a6c3c440e1 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -49,15 +49,9 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] ; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp 
x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 -; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB0_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff ; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 @@ -68,19 +62,25 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB0_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth -; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB0_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB0_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_1 +; SOFTFP-NOLSE-NEXT: b .LBB0_6 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SOFTFP-NOLSE-NEXT: 
mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_1 ; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -137,15 +137,9 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] ; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 -; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB1_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff ; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 @@ -156,19 +150,25 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB1_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth -; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB1_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, 
.LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB1_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_1 +; SOFTFP-NOLSE-NEXT: b .LBB1_6 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_1 ; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -236,34 +236,34 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 -; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 -; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB2_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB2_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth -; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; 
SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB2_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB2_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_1 +; SOFTFP-NOLSE-NEXT: b .LBB2_6 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_1 ; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -330,34 +330,34 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 -; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 -; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB3_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start -; 
SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB3_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth -; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB3_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB3_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_1 +; SOFTFP-NOLSE-NEXT: b .LBB3_6 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_1 ; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -406,32 +406,32 @@ define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: ldr w0, [x0] ; SOFTFP-NOLSE-NEXT: mov w20, w1 -; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 -; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; 
SOFTFP-NOLSE-NEXT: // Child Loop BB4_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: bl __addsf3 ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB4_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w21 -; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB4_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB4_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_1 +; SOFTFP-NOLSE-NEXT: b .LBB4_6 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_1 ; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -480,32 +480,32 @@ define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: ldr x0, [x0] ; SOFTFP-NOLSE-NEXT: mov x20, x1 -; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: 
cbnz w8, .LBB5_6 -; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB5_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov x1, x20 ; SOFTFP-NOLSE-NEXT: mov x21, x0 ; SOFTFP-NOLSE-NEXT: bl __adddf3 ; SOFTFP-NOLSE-NEXT: mov x8, x0 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB5_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x0, [x19] ; SOFTFP-NOLSE-NEXT: cmp x0, x21 -; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB5_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w9, x8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB5_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB5_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB5_1 +; SOFTFP-NOLSE-NEXT: b .LBB5_6 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB5_1 ; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload @@ -701,16 +701,9 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: 
mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 -; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 -; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_2 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff ; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 @@ -731,20 +724,27 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] ; SOFTFP-NOLSE-NEXT: cmp w22, w8 -; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB7_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 ; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_1 +; SOFTFP-NOLSE-NEXT: b .LBB7_6 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // 
%cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_1 ; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 @@ -817,16 +817,9 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 -; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_2 Depth 2 ; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: mov w0, w23 @@ -839,20 +832,27 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB8_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] ; SOFTFP-NOLSE-NEXT: cmp w22, w23 -; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; 
SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB8_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 ; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_1 +; SOFTFP-NOLSE-NEXT: b .LBB8_6 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_1 ; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload @@ -906,16 +906,9 @@ define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 -; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB9_6 -; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB9_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl __addsf3 @@ -928,20 +921,27 @@ define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 ; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 ; SOFTFP-NOLSE-NEXT: orr 
x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB9_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] ; SOFTFP-NOLSE-NEXT: cmp x22, x9 -; SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB9_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB9_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 ; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_1 +; SOFTFP-NOLSE-NEXT: b .LBB9_6 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_1 ; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index e3e18a1f91c6..16825c9dcd17 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -51,15 +51,9 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] ; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore 
-; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 -; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB0_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff ; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 @@ -70,19 +64,25 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB0_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth -; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB0_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB0_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_1 +; SOFTFP-NOLSE-NEXT: b .LBB0_6 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_1 ; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end ; 
SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -139,15 +139,9 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] ; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 -; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB1_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff ; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 @@ -158,19 +152,25 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB1_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth -; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB1_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_2 +; 
SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB1_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_1 +; SOFTFP-NOLSE-NEXT: b .LBB1_6 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_1 ; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -238,34 +238,34 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 -; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 -; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB2_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB2_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth -; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; SOFTFP-NOLSE-NEXT: 
b.ne .LBB2_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB2_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_1 +; SOFTFP-NOLSE-NEXT: b .LBB2_6 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_1 ; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -332,34 +332,34 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 -; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 -; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB3_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl fmaxf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB3_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_1 
Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth -; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB3_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB3_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_1 +; SOFTFP-NOLSE-NEXT: b .LBB3_6 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_1 ; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -408,32 +408,32 @@ define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: ldr w0, [x0] ; SOFTFP-NOLSE-NEXT: mov w20, w1 -; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 -; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB4_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: bl 
fmaxf ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB4_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w21 -; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB4_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB4_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_1 +; SOFTFP-NOLSE-NEXT: b .LBB4_6 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_1 ; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -482,32 +482,32 @@ define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: ldr x0, [x0] ; SOFTFP-NOLSE-NEXT: mov x20, x1 -; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB5_6 -; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: 
// =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB5_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov x1, x20 ; SOFTFP-NOLSE-NEXT: mov x21, x0 ; SOFTFP-NOLSE-NEXT: bl fmax ; SOFTFP-NOLSE-NEXT: mov x8, x0 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB5_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x0, [x19] ; SOFTFP-NOLSE-NEXT: cmp x0, x21 -; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB5_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w9, x8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB5_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB5_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB5_1 +; SOFTFP-NOLSE-NEXT: b .LBB5_6 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB5_1 ; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload @@ -581,16 +581,9 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 -; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %cmpxchg.nostore -; 
SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB6_6 -; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB6_2 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff ; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 @@ -611,20 +604,27 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB6_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] ; SOFTFP-NOLSE-NEXT: cmp w22, w8 -; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB6_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB6_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 ; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_1 +; SOFTFP-NOLSE-NEXT: b .LBB6_6 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: 
clrex +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_1 ; SOFTFP-NOLSE-NEXT: .LBB6_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 @@ -725,16 +725,9 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 -; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_2 Depth 2 ; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: mov w0, w23 @@ -747,20 +740,27 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] ; SOFTFP-NOLSE-NEXT: cmp w22, w23 -; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, 
.LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB7_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 ; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_1 +; SOFTFP-NOLSE-NEXT: b .LBB7_6 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_1 ; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload @@ -814,16 +814,9 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 -; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 -; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fmaxf @@ -836,20 +829,27 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 ; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 ; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; 
SOFTFP-NOLSE-NEXT: .LBB8_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] ; SOFTFP-NOLSE-NEXT: cmp x22, x9 -; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB8_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 ; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_1 +; SOFTFP-NOLSE-NEXT: b .LBB8_6 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_1 ; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index 10de6777bd28..314075c61910 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -51,15 +51,9 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] ; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; 
SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 -; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB0_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB0_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff ; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 @@ -70,19 +64,25 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB0_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth -; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB0_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB0_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_1 +; SOFTFP-NOLSE-NEXT: b .LBB0_6 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_1 ; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 
killed $x0 @@ -139,15 +139,9 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] ; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 -; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB1_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff ; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 @@ -158,19 +152,25 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB1_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth -; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB1_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB1_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, 
.LBB1_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_1 +; SOFTFP-NOLSE-NEXT: b .LBB1_6 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_1 ; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -238,34 +238,34 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 -; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 -; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB2_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB2_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth -; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB2_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=2 ; 
SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB2_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_1 +; SOFTFP-NOLSE-NEXT: b .LBB2_6 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_1 ; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -332,34 +332,34 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 -; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 -; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB3_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB3_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] ; 
SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth -; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB3_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB3_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_1 +; SOFTFP-NOLSE-NEXT: b .LBB3_6 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_1 ; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -408,32 +408,32 @@ define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: ldr w0, [x0] ; SOFTFP-NOLSE-NEXT: mov w20, w1 -; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 -; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB4_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: bl fminf ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: 
// Parent Loop BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB4_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w21 -; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB4_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB4_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_1 +; SOFTFP-NOLSE-NEXT: b .LBB4_6 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_1 ; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -482,32 +482,32 @@ define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: ldr x0, [x0] ; SOFTFP-NOLSE-NEXT: mov x20, x1 -; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB5_6 -; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // 
Child Loop BB5_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov x1, x20 ; SOFTFP-NOLSE-NEXT: mov x21, x0 ; SOFTFP-NOLSE-NEXT: bl fmin ; SOFTFP-NOLSE-NEXT: mov x8, x0 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB5_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x0, [x19] ; SOFTFP-NOLSE-NEXT: cmp x0, x21 -; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB5_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w9, x8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB5_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB5_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB5_1 +; SOFTFP-NOLSE-NEXT: b .LBB5_6 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB5_1 ; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload @@ -581,16 +581,9 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 -; SOFTFP-NOLSE-NEXT: b .LBB6_2 -; SOFTFP-NOLSE-NEXT: .LBB6_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; 
SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB6_6 -; SOFTFP-NOLSE-NEXT: .LBB6_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB6_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB6_2 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff ; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 @@ -611,20 +604,27 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB6_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB6_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB6_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] ; SOFTFP-NOLSE-NEXT: cmp w22, w8 -; SOFTFP-NOLSE-NEXT: b.ne .LBB6_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB6_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB6_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB6_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB6_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 ; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_1 +; SOFTFP-NOLSE-NEXT: b .LBB6_6 +; SOFTFP-NOLSE-NEXT: .LBB6_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB6_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB6_1 ; SOFTFP-NOLSE-NEXT: .LBB6_6: // 
%atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 @@ -725,16 +725,9 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 -; SOFTFP-NOLSE-NEXT: .LBB7_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_2 Depth 2 ; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: mov w0, w23 @@ -747,20 +740,27 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] ; SOFTFP-NOLSE-NEXT: cmp w22, w23 -; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_2 +; 
SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB7_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 ; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_1 +; SOFTFP-NOLSE-NEXT: b .LBB7_6 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_1 ; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload @@ -814,16 +814,9 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 -; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 -; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl fminf @@ -836,20 +829,27 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 ; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 ; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB8_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_1 Depth=1 ; SOFTFP-NOLSE-NEXT: 
// => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] ; SOFTFP-NOLSE-NEXT: cmp x22, x9 -; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB8_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB8_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 ; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_1 +; SOFTFP-NOLSE-NEXT: b .LBB8_6 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_1 ; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll index 82e0f14e68e2..6bb541684c2b 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -49,15 +49,9 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] ; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB0_2 -; SOFTFP-NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB0_6 -; SOFTFP-NOLSE-NEXT: .LBB0_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB0_1: // 
%atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB0_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff ; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 @@ -68,19 +62,25 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB0_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth -; SOFTFP-NOLSE-NEXT: b.ne .LBB0_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB0_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB0_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB0_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_1 +; SOFTFP-NOLSE-NEXT: b .LBB0_6 +; SOFTFP-NOLSE-NEXT: .LBB0_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB0_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB0_1 ; SOFTFP-NOLSE-NEXT: .LBB0_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -137,15 +137,9 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; 
SOFTFP-NOLSE-NEXT: ldrh w0, [x0] ; SOFTFP-NOLSE-NEXT: mov w20, w1 ; SOFTFP-NOLSE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB1_2 -; SOFTFP-NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6 -; SOFTFP-NOLSE-NEXT: .LBB1_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB1_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB1_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff ; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 @@ -156,19 +150,25 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB1_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w22, uxth -; SOFTFP-NOLSE-NEXT: b.ne .LBB1_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB1_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB1_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB1_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_1 +; SOFTFP-NOLSE-NEXT: b .LBB1_6 +; SOFTFP-NOLSE-NEXT: .LBB1_5: // 
%cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB1_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB1_1 ; SOFTFP-NOLSE-NEXT: .LBB1_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -236,34 +236,34 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 -; SOFTFP-NOLSE-NEXT: b .LBB2_2 -; SOFTFP-NOLSE-NEXT: .LBB2_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB2_6 -; SOFTFP-NOLSE-NEXT: .LBB2_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB2_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB2_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB2_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB2_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB2_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth -; SOFTFP-NOLSE-NEXT: b.ne .LBB2_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB2_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in 
Loop: Header=BB2_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB2_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB2_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_1 +; SOFTFP-NOLSE-NEXT: b .LBB2_6 +; SOFTFP-NOLSE-NEXT: .LBB2_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB2_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB2_1 ; SOFTFP-NOLSE-NEXT: .LBB2_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -330,34 +330,34 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: ldrh w0, [x0] ; SOFTFP-NOLSE-NEXT: lsl w20, w1, #16 -; SOFTFP-NOLSE-NEXT: b .LBB3_2 -; SOFTFP-NOLSE-NEXT: .LBB3_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB3_6 -; SOFTFP-NOLSE-NEXT: .LBB3_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB3_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB3_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: lsl w0, w0, #16 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB3_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB3_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB3_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxrh w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w21, uxth -; SOFTFP-NOLSE-NEXT: b.ne .LBB3_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; 
SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB3_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxrh w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB3_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB3_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB3_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_1 +; SOFTFP-NOLSE-NEXT: b .LBB3_6 +; SOFTFP-NOLSE-NEXT: .LBB3_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB3_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB3_1 ; SOFTFP-NOLSE-NEXT: .LBB3_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -406,32 +406,32 @@ define float @test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: ldr w0, [x0] ; SOFTFP-NOLSE-NEXT: mov w20, w1 -; SOFTFP-NOLSE-NEXT: b .LBB4_2 -; SOFTFP-NOLSE-NEXT: .LBB4_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB4_6 -; SOFTFP-NOLSE-NEXT: .LBB4_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB4_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: bl __subsf3 ; SOFTFP-NOLSE-NEXT: mov w8, w0 -; SOFTFP-NOLSE-NEXT: .LBB4_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB4_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB4_1 
Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w0, [x19] ; SOFTFP-NOLSE-NEXT: cmp w0, w21 -; SOFTFP-NOLSE-NEXT: b.ne .LBB4_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB4_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w9, w8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB4_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB4_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB4_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_1 +; SOFTFP-NOLSE-NEXT: b .LBB4_6 +; SOFTFP-NOLSE-NEXT: .LBB4_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB4_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB4_1 ; SOFTFP-NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -480,32 +480,32 @@ define double @test_atomicrmw_fsub_f32_seq_cst_align8(ptr %ptr, double %value) # ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: ldr x0, [x0] ; SOFTFP-NOLSE-NEXT: mov x20, x1 -; SOFTFP-NOLSE-NEXT: b .LBB5_2 -; SOFTFP-NOLSE-NEXT: .LBB5_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB5_6 -; SOFTFP-NOLSE-NEXT: .LBB5_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB5_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB5_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov x1, x20 ; SOFTFP-NOLSE-NEXT: mov x21, x0 ; SOFTFP-NOLSE-NEXT: bl __subdf3 
; SOFTFP-NOLSE-NEXT: mov x8, x0 -; SOFTFP-NOLSE-NEXT: .LBB5_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB5_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB5_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x0, [x19] ; SOFTFP-NOLSE-NEXT: cmp x0, x21 -; SOFTFP-NOLSE-NEXT: b.ne .LBB5_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB5_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w9, x8, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB5_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB5_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB5_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB5_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB5_1 +; SOFTFP-NOLSE-NEXT: b .LBB5_6 +; SOFTFP-NOLSE-NEXT: .LBB5_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB5_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB5_1 ; SOFTFP-NOLSE-NEXT: .LBB5_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; SOFTFP-NOLSE-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload @@ -701,16 +701,9 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 -; SOFTFP-NOLSE-NEXT: b .LBB7_2 -; SOFTFP-NOLSE-NEXT: .LBB7_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB7_6 -; SOFTFP-NOLSE-NEXT: .LBB7_2: // 
%atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB7_2 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff ; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 @@ -731,20 +724,27 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB7_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB7_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB7_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w22, [x20] ; SOFTFP-NOLSE-NEXT: cmp w22, w8 -; SOFTFP-NOLSE-NEXT: b.ne .LBB7_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB7_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w9, w0, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB7_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w9, .LBB7_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB7_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 ; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_1 +; SOFTFP-NOLSE-NEXT: b .LBB7_6 +; SOFTFP-NOLSE-NEXT: .LBB7_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB7_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w23, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB7_1 ; SOFTFP-NOLSE-NEXT: .LBB7_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 @@ -817,16 +817,9 @@ define <2 x 
bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: lsl w21, w8, #16 ; SOFTFP-NOLSE-NEXT: mov x19, x0 ; SOFTFP-NOLSE-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill -; SOFTFP-NOLSE-NEXT: b .LBB8_2 -; SOFTFP-NOLSE-NEXT: .LBB8_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_6 -; SOFTFP-NOLSE-NEXT: .LBB8_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB8_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB8_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB8_2 Depth 2 ; SOFTFP-NOLSE-NEXT: lsl w23, w1, #16 ; SOFTFP-NOLSE-NEXT: mov w1, w20 ; SOFTFP-NOLSE-NEXT: mov w0, w23 @@ -839,20 +832,27 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; SOFTFP-NOLSE-NEXT: bl __truncsfbf2 ; SOFTFP-NOLSE-NEXT: bfxil w23, w22, #0, #16 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 -; SOFTFP-NOLSE-NEXT: .LBB8_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB8_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB8_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr w22, [x19] ; SOFTFP-NOLSE-NEXT: cmp w22, w23 -; SOFTFP-NOLSE-NEXT: b.ne .LBB8_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB8_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w8, w0, [x19] -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB8_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB8_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB8_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 ; SOFTFP-NOLSE-NEXT: 
lsr w1, w22, #16 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_1 +; SOFTFP-NOLSE-NEXT: b .LBB8_6 +; SOFTFP-NOLSE-NEXT: .LBB8_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB8_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr w1, w22, #16 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB8_1 ; SOFTFP-NOLSE-NEXT: .LBB8_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload @@ -906,16 +906,9 @@ define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; SOFTFP-NOLSE-NEXT: mov w19, w2 ; SOFTFP-NOLSE-NEXT: mov x20, x0 -; SOFTFP-NOLSE-NEXT: b .LBB9_2 -; SOFTFP-NOLSE-NEXT: .LBB9_1: // %cmpxchg.nostore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; SOFTFP-NOLSE-NEXT: mov w8, wzr -; SOFTFP-NOLSE-NEXT: clrex -; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB9_6 -; SOFTFP-NOLSE-NEXT: .LBB9_2: // %atomicrmw.start +; SOFTFP-NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 -; SOFTFP-NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; SOFTFP-NOLSE-NEXT: // Child Loop BB9_2 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w0, w23 ; SOFTFP-NOLSE-NEXT: mov w1, w19 ; SOFTFP-NOLSE-NEXT: bl __subsf3 @@ -928,20 +921,27 @@ define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x floa ; SOFTFP-NOLSE-NEXT: // kill: def $w23 killed $w23 killed $x23 def $x23 ; SOFTFP-NOLSE-NEXT: orr x8, x8, x24, lsl #32 ; SOFTFP-NOLSE-NEXT: orr x9, x9, x23, lsl #32 -; SOFTFP-NOLSE-NEXT: .LBB9_3: // %cmpxchg.start -; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: .LBB9_2: // %cmpxchg.start +; SOFTFP-NOLSE-NEXT: // Parent Loop BB9_1 Depth=1 ; SOFTFP-NOLSE-NEXT: // => This Inner Loop Header: Depth=2 ; SOFTFP-NOLSE-NEXT: ldaxr x22, [x20] ; SOFTFP-NOLSE-NEXT: cmp x22, x9 -; 
SOFTFP-NOLSE-NEXT: b.ne .LBB9_1 -; SOFTFP-NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore -; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 +; SOFTFP-NOLSE-NEXT: b.ne .LBB9_5 +; SOFTFP-NOLSE-NEXT: // %bb.3: // %cmpxchg.trystore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2 ; SOFTFP-NOLSE-NEXT: stlxr w10, x8, [x20] -; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_3 -; SOFTFP-NOLSE-NEXT: // %bb.5: // in Loop: Header=BB9_2 Depth=1 +; SOFTFP-NOLSE-NEXT: cbnz w10, .LBB9_2 +; SOFTFP-NOLSE-NEXT: // %bb.4: // in Loop: Header=BB9_1 Depth=1 ; SOFTFP-NOLSE-NEXT: mov w8, #1 // =0x1 ; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 -; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_2 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_1 +; SOFTFP-NOLSE-NEXT: b .LBB9_6 +; SOFTFP-NOLSE-NEXT: .LBB9_5: // %cmpxchg.nostore +; SOFTFP-NOLSE-NEXT: // in Loop: Header=BB9_1 Depth=1 +; SOFTFP-NOLSE-NEXT: mov w8, wzr +; SOFTFP-NOLSE-NEXT: clrex +; SOFTFP-NOLSE-NEXT: lsr x23, x22, #32 +; SOFTFP-NOLSE-NEXT: cbz w8, .LBB9_1 ; SOFTFP-NOLSE-NEXT: .LBB9_6: // %atomicrmw.end ; SOFTFP-NOLSE-NEXT: mov w0, w22 ; SOFTFP-NOLSE-NEXT: mov w1, w23 diff --git a/llvm/test/CodeGen/AArch64/bitcast_truncstore.ll b/llvm/test/CodeGen/AArch64/bitcast_truncstore.ll new file mode 100644 index 000000000000..e1f1bb429409 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/bitcast_truncstore.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +define void @_Z10test_truncstore_f64toi32Pjd(ptr %n, double %x) { +; CHECK-LABEL: _Z10test_truncstore_f64toi32Pjd: +; CHECK: // %bb.0: +; CHECK-NEXT: str s0, [x0] +; CHECK-NEXT: ret + %i64 = bitcast double %x to i64 + %conv = trunc i64 %i64 to i32 + store i32 %conv, ptr %n, align 4 + ret void +} + +define void @_Z9test_truncstore_f64toi16Ptd(ptr %n, double %x) { +; CHECK-LABEL: _Z9test_truncstore_f64toi16Ptd: +; CHECK: // %bb.0: +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: ret + %i64 = bitcast 
double %x to i64 + %conv = trunc i64 %i64 to i16 + store i16 %conv, ptr %n, align 2 + ret void +} + +define void @_Z13test_truncstore_f64toi8Phd(ptr %n, double %x) { +; CHECK-LABEL: _Z13test_truncstore_f64toi8Phd: +; CHECK: // %bb.0: +; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: ret + %i64 = bitcast double %x to i64 + %conv = trunc i64 %i64 to i8 + store i8 %conv, ptr %n, align 1 + ret void +} + +define void @_Z17test_truncstore_f32toi16Ptf(ptr %n, float %x) { +; CHECK-LABEL: _Z17test_truncstore_f32toi16Ptf: +; CHECK: // %bb.0: +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: ret + %i32 = bitcast float %x to i32 + %conv = trunc i32 %i32 to i16 + store i16 %conv, ptr %n, align 2 + ret void +} + +define void @_Z16test_truncstore_f32toi8Phf(ptr %n, float %x) { +; CHECK-LABEL: _Z16test_truncstore_f32toi8Phf: +; CHECK: // %bb.0: +; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: ret + %i32 = bitcast float %x to i32 + %conv = trunc i32 %i32 to i8 + store i8 %conv, ptr %n, align 1 + ret void +} + +define void @test_truncstore_i64tof32(ptr %n, i64 %x) { +; CHECK-LABEL: test_truncstore_i64tof32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d0, x1 +; CHECK-NEXT: fcvt s0, d0 +; CHECK-NEXT: str s0, [x0] +; CHECK-NEXT: ret + %d = bitcast i64 %x to double + %f = fptrunc double %d to float + store float %f, ptr %n, align 4 + ret void +} + +define void @test_truncstore_i32tof16(ptr %n, i32 %x) { +; CHECK-LABEL: test_truncstore_i32tof16: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s0, w1 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: ret + %f = bitcast i32 %x to float + %h = fptrunc float %f to half + store half %h, ptr %n, align 2 + ret void +} diff --git a/llvm/test/CodeGen/AArch64/blr-bti-preserves-operands.mir b/llvm/test/CodeGen/AArch64/blr-bti-preserves-operands.mir index f41e590c870a..a4a392c41ec9 100644 --- a/llvm/test/CodeGen/AArch64/blr-bti-preserves-operands.mir +++ b/llvm/test/CodeGen/AArch64/blr-bti-preserves-operands.mir @@ -8,7 +8,7 @@ # The arguments to the call must 
become implicit arguments, because the branch # only expects to get 1 explicit operand which is the branch target. -# CHECK: BUNDLE implicit-def $lr, implicit-def $sp, implicit $sp, implicit $x0, implicit $w1 { +# CHECK: BUNDLE implicit-def dead $lr, implicit-def $sp, implicit $sp, implicit $x0, implicit $w1 { # CHECK: BL @_setjmp, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x0, implicit $w1, implicit-def dead $lr, implicit $sp, implicit-def $sp # CHECK: HINT 36 # CHECK: } diff --git a/llvm/test/CodeGen/AArch64/bti-ehpad.ll b/llvm/test/CodeGen/AArch64/bti-ehpad.ll new file mode 100644 index 000000000000..674421adaf51 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/bti-ehpad.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-unknown-linux-gnu %s -o - | FileCheck %s + +; Purpose: With BTI enabled, the landing pad (%lpad) begins with an EH_LABEL and the +; first *executed* instruction is `bti j`. (BTI is inserted *after* the EH label and meta.) + +declare i32 @__gxx_personality_v0(...) +declare void @may_throw() + +define void @test() #0 personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: test: +; CHECK: .Lfunc_begin0: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception0 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: bti c +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: .Ltmp0: // EH_LABEL +; CHECK-NEXT: bl may_throw +; CHECK-NEXT: .Ltmp1: // EH_LABEL +; CHECK-NEXT: // %bb.1: // %common.ret +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_2: // %lpad +; CHECK-NEXT: .Ltmp2: // EH_LABEL +; CHECK-NEXT: bti j +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + invoke void @may_throw() + to label %ret unwind label %lpad + +lpad: + landingpad { ptr, i32 } cleanup + ret void + +ret: + ret void +} + +attributes #0 = { noinline "branch-target-enforcement"="true" "target-features"="+bti" } diff --git a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll index b7817ebe59b9..3f4dd116d91f 100644 --- a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll +++ b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll @@ -181,41 +181,41 @@ define i1 @test_conditional2(i32 %a, i32 %b, ptr %c) { ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldaxr w8, [x19] ; CHECK-NEXT: cmp w8, w21 -; CHECK-NEXT: b.ne LBB3_4 +; CHECK-NEXT: b.ne LBB3_9 ; CHECK-NEXT: ; %bb.2: ; %cmpxchg.trystore ; CHECK-NEXT: ; in Loop: Header=BB3_1 Depth=1 ; CHECK-NEXT: stlxr w8, w20, [x19] ; CHECK-NEXT: cbnz w8, LBB3_1 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: mov w8, #1 ; =0x1 -; CHECK-NEXT: b LBB3_5 -; CHECK-NEXT: LBB3_4: ; %cmpxchg.nostore -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: clrex -; CHECK-NEXT: LBB3_5: ; %for.cond.preheader +; CHECK-NEXT: LBB3_4: ; %for.cond.preheader ; CHECK-NEXT: mov w22, #2 ; =0x2 -; CHECK-NEXT: LBB3_6: ; %for.cond +; CHECK-NEXT: LBB3_5: ; %for.cond ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: cbz w22, LBB3_9 -; CHECK-NEXT: ; %bb.7: ; %for.body -; CHECK-NEXT: ; in Loop: Header=BB3_6 Depth=1 +; CHECK-NEXT: cbz w22, LBB3_8 +; CHECK-NEXT: ; %bb.6: ; %for.body +; CHECK-NEXT: ; in Loop: Header=BB3_5 Depth=1 ; 
CHECK-NEXT: sub w22, w22, #1 ; CHECK-NEXT: orr w9, w21, w20 ; CHECK-NEXT: ldr w10, [x19, w22, sxtw #2] ; CHECK-NEXT: cmp w9, w10 -; CHECK-NEXT: b.eq LBB3_6 -; CHECK-NEXT: ; %bb.8: ; %if.then -; CHECK-NEXT: ; in Loop: Header=BB3_6 Depth=1 +; CHECK-NEXT: b.eq LBB3_5 +; CHECK-NEXT: ; %bb.7: ; %if.then +; CHECK-NEXT: ; in Loop: Header=BB3_5 Depth=1 ; CHECK-NEXT: str w9, [x19, w22, sxtw #2] ; CHECK-NEXT: bl _foo ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b LBB3_6 -; CHECK-NEXT: LBB3_9: ; %for.cond.cleanup +; CHECK-NEXT: b LBB3_5 +; CHECK-NEXT: LBB3_8: ; %for.cond.cleanup ; CHECK-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload ; CHECK-NEXT: ldp x22, x21, [sp], #48 ; 16-byte Folded Reload ; CHECK-NEXT: ret +; CHECK-NEXT: LBB3_9: ; %cmpxchg.nostore +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: clrex +; CHECK-NEXT: b LBB3_4 ; ; OUTLINE-ATOMICS-LABEL: test_conditional2: ; OUTLINE-ATOMICS: ; %bb.0: ; %entry diff --git a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll index e3263252875f..1207de746894 100644 --- a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll +++ b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll @@ -119,6 +119,103 @@ entry: ret bfloat %conv1 } +define i64 @testu_f64_multiuse(double %x) { +; CHECK-LABEL: testu_f64_multiuse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu x8, d0 +; CHECK-NEXT: ucvtf d1, x8 +; CHECK-NEXT: fcmp d0, d1 +; CHECK-NEXT: csel x0, x8, xzr, eq +; CHECK-NEXT: ret +entry: + %conv = fptoui double %x to i64 + %conv1 = uitofp i64 %conv to double + %cmp = fcmp oeq double %x, %conv1 + %cond = select i1 %cmp, i64 %conv, i64 0 + ret i64 %cond +} + +define i32 @testu_f32_multiuse(float %x) { +; CHECK-LABEL: testu_f32_multiuse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu w8, s0 +; CHECK-NEXT: ucvtf s1, w8 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: csel w0, w8, wzr, eq +; CHECK-NEXT: ret +entry: + %conv = fptoui 
float %x to i32 + %conv1 = uitofp i32 %conv to float + %cmp = fcmp oeq float %x, %conv1 + %cond = select i1 %cmp, i32 %conv, i32 0 + ret i32 %cond +} + +define i32 @testu_f16_multiuse(half %x) { +; CHECK-LABEL: testu_f16_multiuse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu w8, h0 +; CHECK-NEXT: ucvtf h1, w8 +; CHECK-NEXT: fcmp h0, h1 +; CHECK-NEXT: csel w0, w8, wzr, eq +; CHECK-NEXT: ret +entry: + %conv = fptoui half %x to i32 + %conv1 = uitofp i32 %conv to half + %cmp = fcmp oeq half %x, %conv1 + %cond = select i1 %cmp, i32 %conv, i32 0 + ret i32 %cond +} + +define i64 @tests_f64_multiuse(double %x) { +; CHECK-LABEL: tests_f64_multiuse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs x8, d0 +; CHECK-NEXT: scvtf d1, x8 +; CHECK-NEXT: fcmp d0, d1 +; CHECK-NEXT: csel x0, x8, xzr, eq +; CHECK-NEXT: ret +entry: + %conv = fptosi double %x to i64 + %conv1 = sitofp i64 %conv to double + %cmp = fcmp oeq double %x, %conv1 + %cond = select i1 %cmp, i64 %conv, i64 0 + ret i64 %cond +} + +define i32 @tests_f32_multiuse(float %x) { +; CHECK-LABEL: tests_f32_multiuse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs w8, s0 +; CHECK-NEXT: scvtf s1, w8 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: csel w0, w8, wzr, eq +; CHECK-NEXT: ret +entry: + %conv = fptosi float %x to i32 + %conv1 = sitofp i32 %conv to float + %cmp = fcmp oeq float %x, %conv1 + %cond = select i1 %cmp, i32 %conv, i32 0 + ret i32 %cond +} + +define i32 @tests_f16_multiuse(half %x) { +; CHECK-LABEL: tests_f16_multiuse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs w8, h0 +; CHECK-NEXT: scvtf h1, w8 +; CHECK-NEXT: fcmp h0, h1 +; CHECK-NEXT: csel w0, w8, wzr, eq +; CHECK-NEXT: ret +entry: + %conv = fptosi half %x to i32 + %conv1 = sitofp i32 %conv to half + %cmp = fcmp oeq half %x, %conv1 + %cond = select i1 %cmp, i32 %conv, i32 0 + ret i32 %cond +} + + define double @t1_strict(double %x) #0 { ; CHECK-LABEL: t1_strict: ; CHECK: // %bb.0: // %entry diff --git 
a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll index 53126a08db86..c0c31427307b 100644 --- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll +++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll @@ -8,7 +8,7 @@ declare void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8>, <vscale x define fastcc i8 @allocno_reload_assign(ptr %p) { ; CHECK-LABEL: allocno_reload_assign: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov d0, xzr +; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: uzp1 p0.s, p0.s, p0.s diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll index 7ca6adb1338d..1ca4719d9b6b 100644 --- a/llvm/test/CodeGen/AArch64/expand-select.ll +++ b/llvm/test/CodeGen/AArch64/expand-select.ll @@ -4,8 +4,8 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) { ; CHECK-LABEL: foo: ; CHECK: // %bb.0: +; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: and w8, w0, #0x1 -; CHECK-NEXT: fmov s0, wzr ; CHECK-NEXT: ldr x11, [sp] ; CHECK-NEXT: fmov s1, w8 ; CHECK-NEXT: ldp x8, x10, [sp, #8] @@ -31,8 +31,8 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) { define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, ptr %Out) { ; CHECK-LABEL: bar: ; CHECK: // %bb.0: +; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: and w8, w0, #0x1 -; CHECK-NEXT: fmov s0, wzr ; CHECK-NEXT: ldr x10, [sp, #16] ; CHECK-NEXT: fmov s1, w8 ; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s diff --git a/llvm/test/CodeGen/AArch64/ext-narrow-index.ll b/llvm/test/CodeGen/AArch64/ext-narrow-index.ll index 177f2cafcf83..f62cfef9baf2 100644 --- a/llvm/test/CodeGen/AArch64/ext-narrow-index.ll +++ b/llvm/test/CodeGen/AArch64/ext-narrow-index.ll @@ -382,7 +382,7 @@ entry: define <1 x i64> @i64_zero_off2(<2 x i64> %arg1) { ; CHECK-LABEL: i64_zero_off2: ; CHECK: // %bb.0: // 
%entry -; CHECK-NEXT: fmov d0, xzr +; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i64> %arg1, <2 x i64> zeroinitializer, <1 x i32> <i32 2> diff --git a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll index f076ee12427d..832e34b664fb 100644 --- a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll +++ b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll @@ -142,14 +142,12 @@ for.cond.cleanup: } -; TODO: Combine the sbfx(cset) into a csetm define i32 @issue_121372(<4 x i32> %v) { ; CHECK-LABEL: issue_121372: ; CHECK: // %bb.0: ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: cset w8, eq -; CHECK-NEXT: sbfx w8, w8, #0, #1 +; CHECK-NEXT: csetm w8, eq ; CHECK-NEXT: cmp w8, #1 ; CHECK-NEXT: csetm w0, lt ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll index 18b028c9898e..6ab703c08b83 100644 --- a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll @@ -1093,3 +1093,38 @@ loop: ret: ret i32 %3 } + +define <3 x ptr> @v3move(<3 x ptr> %a, <3 x ptr> %b, <3 x ptr> %x) { +; CHECK-SD-LABEL: v3move: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov d1, d7 +; CHECK-SD-NEXT: fmov d0, d6 +; CHECK-SD-NEXT: ldr d2, [sp] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v3move: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr x8, [sp] +; CHECK-GI-NEXT: fmov d0, d6 +; CHECK-GI-NEXT: fmov d1, d7 +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: ret +entry: + ret <3 x ptr> %x +} + +define ptr @v3ext(<3 x ptr> %a, <3 x ptr> %b, <3 x ptr> %x) { +; CHECK-SD-LABEL: v3ext: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr d0, [sp] +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v3ext: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr x0, [sp] +; CHECK-GI-NEXT: ret +entry: + %c = extractelement <3 x ptr> %x, i32 2 + ret 
ptr %c +} diff --git a/llvm/test/CodeGen/AArch64/f16-imm.ll b/llvm/test/CodeGen/AArch64/f16-imm.ll index 58793bf19f3a..68873f9b7c3d 100644 --- a/llvm/test/CodeGen/AArch64/f16-imm.ll +++ b/llvm/test/CodeGen/AArch64/f16-imm.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+no-zcz-fp | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-NOZCZ -; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+zcz | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-ZCZ +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+no-zcz-fpr64 | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-NOZCZ +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+zcz-gpr32,+zcz-gpr64 | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-ZCZ ; RUN: llc < %s -mtriple=aarch64 -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK-NOFP16 define half @Const0() { diff --git a/llvm/test/CodeGen/AArch64/fixed_masked_deinterleaved_loads.ll b/llvm/test/CodeGen/AArch64/fixed_masked_deinterleaved_loads.ll new file mode 100644 index 000000000000..730dfed5ff22 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fixed_masked_deinterleaved_loads.ll @@ -0,0 +1,464 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +define { <16 x i8>, <16 x i8> } @foo_ld2_v16i8(<16 x i1> %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 v1.16b, v0.16b, v0.16b +; CHECK-NEXT: zip1 v0.16b, v0.16b, v0.16b +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ext v2.16b, v1.16b, 
v1.16b, #8 +; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v1.16b, v1.16b, v2.16b +; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b +; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: bfi w8, w9, #16, #16 +; CHECK-NEXT: tbz w8, #0, .LBB0_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: ldr b1, [x0] +; CHECK-NEXT: tbnz w8, #1, .LBB0_3 +; CHECK-NEXT: b .LBB0_4 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #1, .LBB0_4 +; CHECK-NEXT: .LBB0_3: // %cond.load1 +; CHECK-NEXT: add x9, x0, #1 +; CHECK-NEXT: ld1 { v1.b }[1], [x9] +; CHECK-NEXT: .LBB0_4: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB0_20 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB0_21 +; CHECK-NEXT: .LBB0_6: // %else8 +; CHECK-NEXT: tbnz w8, #4, .LBB0_22 +; CHECK-NEXT: .LBB0_7: // %else11 +; CHECK-NEXT: tbnz w8, #5, .LBB0_23 +; CHECK-NEXT: .LBB0_8: // %else14 +; CHECK-NEXT: tbnz w8, #6, .LBB0_24 +; CHECK-NEXT: .LBB0_9: // %else17 +; CHECK-NEXT: tbnz w8, #7, .LBB0_25 +; CHECK-NEXT: .LBB0_10: // %else20 +; CHECK-NEXT: tbnz w8, #8, .LBB0_26 +; CHECK-NEXT: .LBB0_11: // %else23 +; CHECK-NEXT: tbnz w8, #9, .LBB0_27 +; CHECK-NEXT: .LBB0_12: // %else26 +; CHECK-NEXT: tbnz w8, #10, .LBB0_28 +; CHECK-NEXT: .LBB0_13: // %else29 +; CHECK-NEXT: tbnz w8, #11, .LBB0_29 +; CHECK-NEXT: .LBB0_14: // %else32 +; CHECK-NEXT: tbnz w8, #12, .LBB0_30 +; CHECK-NEXT: .LBB0_15: // %else35 +; CHECK-NEXT: tbnz w8, #13, .LBB0_31 +; CHECK-NEXT: .LBB0_16: // %else38 +; CHECK-NEXT: tbnz w8, #14, .LBB0_32 +; CHECK-NEXT: .LBB0_17: // %else41 +; CHECK-NEXT: tbnz w8, #15, .LBB0_33 +; CHECK-NEXT: .LBB0_18: // %else44 +; CHECK-NEXT: tbz w8, #16, .LBB0_34 +; CHECK-NEXT: .LBB0_19: // %cond.load46 +; CHECK-NEXT: add x9, x0, #16 +; CHECK-NEXT: ld1 { v2.b }[0], [x9] +; CHECK-NEXT: tbnz w8, #17, .LBB0_35 +; CHECK-NEXT: b .LBB0_36 +; CHECK-NEXT: .LBB0_20: // %cond.load4 +; CHECK-NEXT: add x9, x0, #2 +; 
CHECK-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB0_6 +; CHECK-NEXT: .LBB0_21: // %cond.load7 +; CHECK-NEXT: add x9, x0, #3 +; CHECK-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-NEXT: tbz w8, #4, .LBB0_7 +; CHECK-NEXT: .LBB0_22: // %cond.load10 +; CHECK-NEXT: add x9, x0, #4 +; CHECK-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-NEXT: tbz w8, #5, .LBB0_8 +; CHECK-NEXT: .LBB0_23: // %cond.load13 +; CHECK-NEXT: add x9, x0, #5 +; CHECK-NEXT: ld1 { v1.b }[5], [x9] +; CHECK-NEXT: tbz w8, #6, .LBB0_9 +; CHECK-NEXT: .LBB0_24: // %cond.load16 +; CHECK-NEXT: add x9, x0, #6 +; CHECK-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-NEXT: tbz w8, #7, .LBB0_10 +; CHECK-NEXT: .LBB0_25: // %cond.load19 +; CHECK-NEXT: add x9, x0, #7 +; CHECK-NEXT: ld1 { v1.b }[7], [x9] +; CHECK-NEXT: tbz w8, #8, .LBB0_11 +; CHECK-NEXT: .LBB0_26: // %cond.load22 +; CHECK-NEXT: add x9, x0, #8 +; CHECK-NEXT: ld1 { v1.b }[8], [x9] +; CHECK-NEXT: tbz w8, #9, .LBB0_12 +; CHECK-NEXT: .LBB0_27: // %cond.load25 +; CHECK-NEXT: add x9, x0, #9 +; CHECK-NEXT: ld1 { v1.b }[9], [x9] +; CHECK-NEXT: tbz w8, #10, .LBB0_13 +; CHECK-NEXT: .LBB0_28: // %cond.load28 +; CHECK-NEXT: add x9, x0, #10 +; CHECK-NEXT: ld1 { v1.b }[10], [x9] +; CHECK-NEXT: tbz w8, #11, .LBB0_14 +; CHECK-NEXT: .LBB0_29: // %cond.load31 +; CHECK-NEXT: add x9, x0, #11 +; CHECK-NEXT: ld1 { v1.b }[11], [x9] +; CHECK-NEXT: tbz w8, #12, .LBB0_15 +; CHECK-NEXT: .LBB0_30: // %cond.load34 +; CHECK-NEXT: add x9, x0, #12 +; CHECK-NEXT: ld1 { v1.b }[12], [x9] +; CHECK-NEXT: tbz w8, #13, .LBB0_16 +; CHECK-NEXT: .LBB0_31: // %cond.load37 +; CHECK-NEXT: add x9, x0, #13 +; CHECK-NEXT: ld1 { v1.b }[13], [x9] +; CHECK-NEXT: tbz w8, #14, .LBB0_17 +; CHECK-NEXT: .LBB0_32: // %cond.load40 +; CHECK-NEXT: add x9, x0, #14 +; CHECK-NEXT: ld1 { v1.b }[14], [x9] +; CHECK-NEXT: tbz w8, #15, .LBB0_18 +; CHECK-NEXT: .LBB0_33: // %cond.load43 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: ld1 { v1.b }[15], [x9] +; CHECK-NEXT: tbnz w8, #16, .LBB0_19 +; CHECK-NEXT: .LBB0_34: +; CHECK-NEXT: // 
implicit-def: $q2 +; CHECK-NEXT: tbz w8, #17, .LBB0_36 +; CHECK-NEXT: .LBB0_35: // %cond.load49 +; CHECK-NEXT: add x9, x0, #17 +; CHECK-NEXT: ld1 { v2.b }[1], [x9] +; CHECK-NEXT: .LBB0_36: // %else50 +; CHECK-NEXT: tbnz w8, #18, .LBB0_52 +; CHECK-NEXT: // %bb.37: // %else53 +; CHECK-NEXT: tbnz w8, #19, .LBB0_53 +; CHECK-NEXT: .LBB0_38: // %else56 +; CHECK-NEXT: tbnz w8, #20, .LBB0_54 +; CHECK-NEXT: .LBB0_39: // %else59 +; CHECK-NEXT: tbnz w8, #21, .LBB0_55 +; CHECK-NEXT: .LBB0_40: // %else62 +; CHECK-NEXT: tbnz w8, #22, .LBB0_56 +; CHECK-NEXT: .LBB0_41: // %else65 +; CHECK-NEXT: tbnz w8, #23, .LBB0_57 +; CHECK-NEXT: .LBB0_42: // %else68 +; CHECK-NEXT: tbnz w8, #24, .LBB0_58 +; CHECK-NEXT: .LBB0_43: // %else71 +; CHECK-NEXT: tbnz w8, #25, .LBB0_59 +; CHECK-NEXT: .LBB0_44: // %else74 +; CHECK-NEXT: tbnz w8, #26, .LBB0_60 +; CHECK-NEXT: .LBB0_45: // %else77 +; CHECK-NEXT: tbnz w8, #27, .LBB0_61 +; CHECK-NEXT: .LBB0_46: // %else80 +; CHECK-NEXT: tbnz w8, #28, .LBB0_62 +; CHECK-NEXT: .LBB0_47: // %else83 +; CHECK-NEXT: tbnz w8, #29, .LBB0_63 +; CHECK-NEXT: .LBB0_48: // %else86 +; CHECK-NEXT: tbnz w8, #30, .LBB0_64 +; CHECK-NEXT: .LBB0_49: // %else89 +; CHECK-NEXT: tbz w8, #31, .LBB0_51 +; CHECK-NEXT: .LBB0_50: // %cond.load91 +; CHECK-NEXT: add x8, x0, #31 +; CHECK-NEXT: ld1 { v2.b }[15], [x8] +; CHECK-NEXT: .LBB0_51: // %else92 +; CHECK-NEXT: uzp1 v0.16b, v1.16b, v2.16b +; CHECK-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_52: // %cond.load52 +; CHECK-NEXT: add x9, x0, #18 +; CHECK-NEXT: ld1 { v2.b }[2], [x9] +; CHECK-NEXT: tbz w8, #19, .LBB0_38 +; CHECK-NEXT: .LBB0_53: // %cond.load55 +; CHECK-NEXT: add x9, x0, #19 +; CHECK-NEXT: ld1 { v2.b }[3], [x9] +; CHECK-NEXT: tbz w8, #20, .LBB0_39 +; CHECK-NEXT: .LBB0_54: // %cond.load58 +; CHECK-NEXT: add x9, x0, #20 +; CHECK-NEXT: ld1 { v2.b }[4], [x9] +; CHECK-NEXT: tbz w8, #21, .LBB0_40 +; CHECK-NEXT: .LBB0_55: // %cond.load61 +; CHECK-NEXT: add x9, x0, #21 +; CHECK-NEXT: ld1 { v2.b }[5], [x9] 
+; CHECK-NEXT: tbz w8, #22, .LBB0_41 +; CHECK-NEXT: .LBB0_56: // %cond.load64 +; CHECK-NEXT: add x9, x0, #22 +; CHECK-NEXT: ld1 { v2.b }[6], [x9] +; CHECK-NEXT: tbz w8, #23, .LBB0_42 +; CHECK-NEXT: .LBB0_57: // %cond.load67 +; CHECK-NEXT: add x9, x0, #23 +; CHECK-NEXT: ld1 { v2.b }[7], [x9] +; CHECK-NEXT: tbz w8, #24, .LBB0_43 +; CHECK-NEXT: .LBB0_58: // %cond.load70 +; CHECK-NEXT: add x9, x0, #24 +; CHECK-NEXT: ld1 { v2.b }[8], [x9] +; CHECK-NEXT: tbz w8, #25, .LBB0_44 +; CHECK-NEXT: .LBB0_59: // %cond.load73 +; CHECK-NEXT: add x9, x0, #25 +; CHECK-NEXT: ld1 { v2.b }[9], [x9] +; CHECK-NEXT: tbz w8, #26, .LBB0_45 +; CHECK-NEXT: .LBB0_60: // %cond.load76 +; CHECK-NEXT: add x9, x0, #26 +; CHECK-NEXT: ld1 { v2.b }[10], [x9] +; CHECK-NEXT: tbz w8, #27, .LBB0_46 +; CHECK-NEXT: .LBB0_61: // %cond.load79 +; CHECK-NEXT: add x9, x0, #27 +; CHECK-NEXT: ld1 { v2.b }[11], [x9] +; CHECK-NEXT: tbz w8, #28, .LBB0_47 +; CHECK-NEXT: .LBB0_62: // %cond.load82 +; CHECK-NEXT: add x9, x0, #28 +; CHECK-NEXT: ld1 { v2.b }[12], [x9] +; CHECK-NEXT: tbz w8, #29, .LBB0_48 +; CHECK-NEXT: .LBB0_63: // %cond.load85 +; CHECK-NEXT: add x9, x0, #29 +; CHECK-NEXT: ld1 { v2.b }[13], [x9] +; CHECK-NEXT: tbz w8, #30, .LBB0_49 +; CHECK-NEXT: .LBB0_64: // %cond.load88 +; CHECK-NEXT: add x9, x0, #30 +; CHECK-NEXT: ld1 { v2.b }[14], [x9] +; CHECK-NEXT: tbnz w8, #31, .LBB0_50 +; CHECK-NEXT: b .LBB0_51 + %interleaved.mask = call <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1> %mask, <16 x i1> %mask) + %wide.masked.vec = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr %p, i32 1, <32 x i1> %interleaved.mask, <32 x i8> poison) + %deinterleaved.vec = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %wide.masked.vec) + ret { <16 x i8>, <16 x i8> } %deinterleaved.vec +} + +define { <8 x i16>, <8 x i16> } @foo_ld2_v8i16(<8 x i1> %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: adrp x8, .LCPI1_0 +; 
CHECK-NEXT: zip1 v0.16b, v0.16b, v0.16b +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v0.16b, v0.16b, v1.16b +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tbz w8, #0, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: ldr h1, [x0] +; CHECK-NEXT: tbnz w8, #1, .LBB1_3 +; CHECK-NEXT: b .LBB1_4 +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #1, .LBB1_4 +; CHECK-NEXT: .LBB1_3: // %cond.load1 +; CHECK-NEXT: add x9, x0, #2 +; CHECK-NEXT: ld1 { v1.h }[1], [x9] +; CHECK-NEXT: .LBB1_4: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB1_12 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB1_13 +; CHECK-NEXT: .LBB1_6: // %else8 +; CHECK-NEXT: tbnz w8, #4, .LBB1_14 +; CHECK-NEXT: .LBB1_7: // %else11 +; CHECK-NEXT: tbnz w8, #5, .LBB1_15 +; CHECK-NEXT: .LBB1_8: // %else14 +; CHECK-NEXT: tbnz w8, #6, .LBB1_16 +; CHECK-NEXT: .LBB1_9: // %else17 +; CHECK-NEXT: tbnz w8, #7, .LBB1_17 +; CHECK-NEXT: .LBB1_10: // %else20 +; CHECK-NEXT: tbz w8, #8, .LBB1_18 +; CHECK-NEXT: .LBB1_11: // %cond.load22 +; CHECK-NEXT: add x9, x0, #16 +; CHECK-NEXT: ld1 { v2.h }[0], [x9] +; CHECK-NEXT: tbnz w8, #9, .LBB1_19 +; CHECK-NEXT: b .LBB1_20 +; CHECK-NEXT: .LBB1_12: // %cond.load4 +; CHECK-NEXT: add x9, x0, #4 +; CHECK-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB1_6 +; CHECK-NEXT: .LBB1_13: // %cond.load7 +; CHECK-NEXT: add x9, x0, #6 +; CHECK-NEXT: ld1 { v1.h }[3], [x9] +; CHECK-NEXT: tbz w8, #4, .LBB1_7 +; CHECK-NEXT: .LBB1_14: // %cond.load10 +; CHECK-NEXT: add x9, x0, #8 +; CHECK-NEXT: ld1 { v1.h }[4], [x9] +; CHECK-NEXT: tbz w8, #5, .LBB1_8 +; CHECK-NEXT: .LBB1_15: // %cond.load13 +; CHECK-NEXT: add x9, x0, #10 +; CHECK-NEXT: ld1 { v1.h }[5], [x9] +; CHECK-NEXT: tbz w8, #6, .LBB1_9 +; CHECK-NEXT: .LBB1_16: // %cond.load16 
+; CHECK-NEXT: add x9, x0, #12 +; CHECK-NEXT: ld1 { v1.h }[6], [x9] +; CHECK-NEXT: tbz w8, #7, .LBB1_10 +; CHECK-NEXT: .LBB1_17: // %cond.load19 +; CHECK-NEXT: add x9, x0, #14 +; CHECK-NEXT: ld1 { v1.h }[7], [x9] +; CHECK-NEXT: tbnz w8, #8, .LBB1_11 +; CHECK-NEXT: .LBB1_18: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #9, .LBB1_20 +; CHECK-NEXT: .LBB1_19: // %cond.load25 +; CHECK-NEXT: add x9, x0, #18 +; CHECK-NEXT: ld1 { v2.h }[1], [x9] +; CHECK-NEXT: .LBB1_20: // %else26 +; CHECK-NEXT: tbnz w8, #10, .LBB1_28 +; CHECK-NEXT: // %bb.21: // %else29 +; CHECK-NEXT: tbnz w8, #11, .LBB1_29 +; CHECK-NEXT: .LBB1_22: // %else32 +; CHECK-NEXT: tbnz w8, #12, .LBB1_30 +; CHECK-NEXT: .LBB1_23: // %else35 +; CHECK-NEXT: tbnz w8, #13, .LBB1_31 +; CHECK-NEXT: .LBB1_24: // %else38 +; CHECK-NEXT: tbnz w8, #14, .LBB1_32 +; CHECK-NEXT: .LBB1_25: // %else41 +; CHECK-NEXT: tbz w8, #15, .LBB1_27 +; CHECK-NEXT: .LBB1_26: // %cond.load43 +; CHECK-NEXT: add x8, x0, #30 +; CHECK-NEXT: ld1 { v2.h }[7], [x8] +; CHECK-NEXT: .LBB1_27: // %else44 +; CHECK-NEXT: uzp1 v0.8h, v1.8h, v2.8h +; CHECK-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_28: // %cond.load28 +; CHECK-NEXT: add x9, x0, #20 +; CHECK-NEXT: ld1 { v2.h }[2], [x9] +; CHECK-NEXT: tbz w8, #11, .LBB1_22 +; CHECK-NEXT: .LBB1_29: // %cond.load31 +; CHECK-NEXT: add x9, x0, #22 +; CHECK-NEXT: ld1 { v2.h }[3], [x9] +; CHECK-NEXT: tbz w8, #12, .LBB1_23 +; CHECK-NEXT: .LBB1_30: // %cond.load34 +; CHECK-NEXT: add x9, x0, #24 +; CHECK-NEXT: ld1 { v2.h }[4], [x9] +; CHECK-NEXT: tbz w8, #13, .LBB1_24 +; CHECK-NEXT: .LBB1_31: // %cond.load37 +; CHECK-NEXT: add x9, x0, #26 +; CHECK-NEXT: ld1 { v2.h }[5], [x9] +; CHECK-NEXT: tbz w8, #14, .LBB1_25 +; CHECK-NEXT: .LBB1_32: // %cond.load40 +; CHECK-NEXT: add x9, x0, #28 +; CHECK-NEXT: ld1 { v2.h }[6], [x9] +; CHECK-NEXT: tbnz w8, #15, .LBB1_26 +; CHECK-NEXT: b .LBB1_27 + %interleaved.mask = call <16 x i1> @llvm.vector.interleave2.v16i1(<8 x i1> %mask, <8 x i1> 
%mask) + %wide.masked.vec = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr %p, i32 2, <16 x i1> %interleaved.mask, <16 x i16> poison) + %deinterleaved.vec = call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> %wide.masked.vec) + ret { <8 x i16>, <8 x i16> } %deinterleaved.vec +} + +define { <4 x float>, <4 x float> } @foo_ld2_v4f32(<4 x i1> %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-NEXT: shl v0.8b, v0.8b, #7 +; CHECK-NEXT: cmlt v0.8b, v0.8b, #0 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: addv b0, v0.8b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tbz w8, #0, .LBB2_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: tbnz w8, #1, .LBB2_3 +; CHECK-NEXT: b .LBB2_4 +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #1, .LBB2_4 +; CHECK-NEXT: .LBB2_3: // %cond.load1 +; CHECK-NEXT: add x9, x0, #4 +; CHECK-NEXT: ld1 { v1.s }[1], [x9] +; CHECK-NEXT: .LBB2_4: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB2_8 +; CHECK-NEXT: // %bb.5: // %else5 +; CHECK-NEXT: tbnz w8, #3, .LBB2_9 +; CHECK-NEXT: .LBB2_6: // %else8 +; CHECK-NEXT: tbz w8, #4, .LBB2_10 +; CHECK-NEXT: .LBB2_7: // %cond.load10 +; CHECK-NEXT: add x9, x0, #16 +; CHECK-NEXT: ld1 { v2.s }[0], [x9] +; CHECK-NEXT: tbnz w8, #5, .LBB2_11 +; CHECK-NEXT: b .LBB2_12 +; CHECK-NEXT: .LBB2_8: // %cond.load4 +; CHECK-NEXT: add x9, x0, #8 +; CHECK-NEXT: ld1 { v1.s }[2], [x9] +; CHECK-NEXT: tbz w8, #3, .LBB2_6 +; CHECK-NEXT: .LBB2_9: // %cond.load7 +; CHECK-NEXT: add x9, x0, #12 +; CHECK-NEXT: ld1 { v1.s }[3], [x9] +; CHECK-NEXT: tbnz w8, #4, .LBB2_7 +; CHECK-NEXT: .LBB2_10: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #5, .LBB2_12 +; CHECK-NEXT: .LBB2_11: // %cond.load13 +; CHECK-NEXT: add x9, x0, #20 +; CHECK-NEXT: ld1 { v2.s }[1], [x9] +; 
CHECK-NEXT: .LBB2_12: // %else14 +; CHECK-NEXT: tbnz w8, #6, .LBB2_16 +; CHECK-NEXT: // %bb.13: // %else17 +; CHECK-NEXT: tbz w8, #7, .LBB2_15 +; CHECK-NEXT: .LBB2_14: // %cond.load19 +; CHECK-NEXT: add x8, x0, #28 +; CHECK-NEXT: ld1 { v2.s }[3], [x8] +; CHECK-NEXT: .LBB2_15: // %else20 +; CHECK-NEXT: uzp1 v0.4s, v1.4s, v2.4s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB2_16: // %cond.load16 +; CHECK-NEXT: add x9, x0, #24 +; CHECK-NEXT: ld1 { v2.s }[2], [x9] +; CHECK-NEXT: tbnz w8, #7, .LBB2_14 +; CHECK-NEXT: b .LBB2_15 + %interleaved.mask = call <8 x i1> @llvm.vector.interleave2.v8i1(<4 x i1> %mask, <4 x i1> %mask) + %wide.masked.vec = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %p, i32 4, <8 x i1> %interleaved.mask, <8 x float> poison) + %deinterleaved.vec = call { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v16f32(<8 x float> %wide.masked.vec) + ret { <4 x float>, <4 x float> } %deinterleaved.vec +} + +define { <2 x double>, <2 x double> } @foo_ld2_v2f64(<2 x i1> %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: zip1 v0.4h, v0.4h, v0.4h +; CHECK-NEXT: shl v0.4h, v0.4h, #15 +; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: addv h0, v0.4h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tbz w8, #0, .LBB3_2 +; CHECK-NEXT: // %bb.1: // %cond.load +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: tbnz w8, #1, .LBB3_3 +; CHECK-NEXT: b .LBB3_4 +; CHECK-NEXT: .LBB3_2: +; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: tbz w8, #1, .LBB3_4 +; CHECK-NEXT: .LBB3_3: // %cond.load1 +; CHECK-NEXT: add x9, x0, #8 +; CHECK-NEXT: ld1 { v1.d }[1], [x9] +; CHECK-NEXT: .LBB3_4: // %else2 +; CHECK-NEXT: tbz w8, #2, .LBB3_6 +; CHECK-NEXT: // %bb.5: // %cond.load4 +; CHECK-NEXT: add x9, x0, #16 +; CHECK-NEXT: ld1 { v2.d }[0], [x9] +; CHECK-NEXT: tbnz w8, #3, .LBB3_7 
+; CHECK-NEXT: b .LBB3_8 +; CHECK-NEXT: .LBB3_6: +; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: tbz w8, #3, .LBB3_8 +; CHECK-NEXT: .LBB3_7: // %cond.load7 +; CHECK-NEXT: add x8, x0, #24 +; CHECK-NEXT: ld1 { v2.d }[1], [x8] +; CHECK-NEXT: .LBB3_8: // %else8 +; CHECK-NEXT: zip1 v0.2d, v1.2d, v2.2d +; CHECK-NEXT: zip2 v1.2d, v1.2d, v2.2d +; CHECK-NEXT: ret + %interleaved.mask = call <4 x i1> @llvm.vector.interleave2.v4i1(<2 x i1> %mask, <2 x i1> %mask) + %wide.masked.vec = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %p, i32 8, <4 x i1> %interleaved.mask, <4 x double> poison) + %deinterleaved.vec = call { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double> %wide.masked.vec) + ret { <2 x double>, <2 x double> } %deinterleaved.vec +} + diff --git a/llvm/test/CodeGen/AArch64/fixed_masked_interleaved_stores.ll b/llvm/test/CodeGen/AArch64/fixed_masked_interleaved_stores.ll new file mode 100644 index 000000000000..ac1db2bc56bb --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fixed_masked_interleaved_stores.ll @@ -0,0 +1,455 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +define void @foo_st2_v16i8(<16 x i1> %mask, <16 x i8> %val1, <16 x i8> %val2, ptr %p) { +; CHECK-LABEL: foo_st2_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 v3.16b, v0.16b, v0.16b +; CHECK-NEXT: zip1 v0.16b, v0.16b, v0.16b +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: shl v3.16b, v3.16b, #7 +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: cmlt v3.16b, v3.16b, #0 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: and v3.16b, v3.16b, v4.16b +; CHECK-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-NEXT: ext v4.16b, v3.16b, v3.16b, #8 +; CHECK-NEXT: ext v5.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v3.16b, v3.16b, v4.16b +; CHECK-NEXT: zip1 v0.16b, 
v0.16b, v5.16b +; CHECK-NEXT: addv h3, v3.8h +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: zip1 v0.16b, v1.16b, v2.16b +; CHECK-NEXT: bfi w8, w9, #16, #16 +; CHECK-NEXT: tbnz w8, #0, .LBB0_33 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB0_34 +; CHECK-NEXT: .LBB0_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB0_35 +; CHECK-NEXT: .LBB0_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB0_36 +; CHECK-NEXT: .LBB0_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB0_37 +; CHECK-NEXT: .LBB0_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB0_38 +; CHECK-NEXT: .LBB0_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB0_39 +; CHECK-NEXT: .LBB0_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB0_40 +; CHECK-NEXT: .LBB0_8: // %else14 +; CHECK-NEXT: tbnz w8, #8, .LBB0_41 +; CHECK-NEXT: .LBB0_9: // %else16 +; CHECK-NEXT: tbnz w8, #9, .LBB0_42 +; CHECK-NEXT: .LBB0_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB0_43 +; CHECK-NEXT: .LBB0_11: // %else20 +; CHECK-NEXT: tbnz w8, #11, .LBB0_44 +; CHECK-NEXT: .LBB0_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB0_45 +; CHECK-NEXT: .LBB0_13: // %else24 +; CHECK-NEXT: tbnz w8, #13, .LBB0_46 +; CHECK-NEXT: .LBB0_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB0_47 +; CHECK-NEXT: .LBB0_15: // %else28 +; CHECK-NEXT: tbnz w8, #15, .LBB0_48 +; CHECK-NEXT: .LBB0_16: // %else30 +; CHECK-NEXT: zip2 v0.16b, v1.16b, v2.16b +; CHECK-NEXT: tbnz w8, #16, .LBB0_49 +; CHECK-NEXT: .LBB0_17: // %else32 +; CHECK-NEXT: tbnz w8, #17, .LBB0_50 +; CHECK-NEXT: .LBB0_18: // %else34 +; CHECK-NEXT: tbnz w8, #18, .LBB0_51 +; CHECK-NEXT: .LBB0_19: // %else36 +; CHECK-NEXT: tbnz w8, #19, .LBB0_52 +; CHECK-NEXT: .LBB0_20: // %else38 +; CHECK-NEXT: tbnz w8, #20, .LBB0_53 +; CHECK-NEXT: .LBB0_21: // %else40 +; CHECK-NEXT: tbnz w8, #21, .LBB0_54 +; CHECK-NEXT: .LBB0_22: // %else42 +; CHECK-NEXT: tbnz w8, #22, .LBB0_55 +; CHECK-NEXT: .LBB0_23: // %else44 +; CHECK-NEXT: tbnz w8, #23, .LBB0_56 +; CHECK-NEXT: .LBB0_24: // 
%else46 +; CHECK-NEXT: tbnz w8, #24, .LBB0_57 +; CHECK-NEXT: .LBB0_25: // %else48 +; CHECK-NEXT: tbnz w8, #25, .LBB0_58 +; CHECK-NEXT: .LBB0_26: // %else50 +; CHECK-NEXT: tbnz w8, #26, .LBB0_59 +; CHECK-NEXT: .LBB0_27: // %else52 +; CHECK-NEXT: tbnz w8, #27, .LBB0_60 +; CHECK-NEXT: .LBB0_28: // %else54 +; CHECK-NEXT: tbnz w8, #28, .LBB0_61 +; CHECK-NEXT: .LBB0_29: // %else56 +; CHECK-NEXT: tbnz w8, #29, .LBB0_62 +; CHECK-NEXT: .LBB0_30: // %else58 +; CHECK-NEXT: tbnz w8, #30, .LBB0_63 +; CHECK-NEXT: .LBB0_31: // %else60 +; CHECK-NEXT: tbnz w8, #31, .LBB0_64 +; CHECK-NEXT: .LBB0_32: // %else62 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_33: // %cond.store +; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: tbz w8, #1, .LBB0_2 +; CHECK-NEXT: .LBB0_34: // %cond.store1 +; CHECK-NEXT: mov b3, v0.b[1] +; CHECK-NEXT: stur b3, [x0, #1] +; CHECK-NEXT: tbz w8, #2, .LBB0_3 +; CHECK-NEXT: .LBB0_35: // %cond.store3 +; CHECK-NEXT: mov b3, v0.b[2] +; CHECK-NEXT: stur b3, [x0, #2] +; CHECK-NEXT: tbz w8, #3, .LBB0_4 +; CHECK-NEXT: .LBB0_36: // %cond.store5 +; CHECK-NEXT: mov b3, v0.b[3] +; CHECK-NEXT: stur b3, [x0, #3] +; CHECK-NEXT: tbz w8, #4, .LBB0_5 +; CHECK-NEXT: .LBB0_37: // %cond.store7 +; CHECK-NEXT: mov b3, v0.b[4] +; CHECK-NEXT: stur b3, [x0, #4] +; CHECK-NEXT: tbz w8, #5, .LBB0_6 +; CHECK-NEXT: .LBB0_38: // %cond.store9 +; CHECK-NEXT: mov b3, v0.b[5] +; CHECK-NEXT: stur b3, [x0, #5] +; CHECK-NEXT: tbz w8, #6, .LBB0_7 +; CHECK-NEXT: .LBB0_39: // %cond.store11 +; CHECK-NEXT: mov b3, v0.b[6] +; CHECK-NEXT: stur b3, [x0, #6] +; CHECK-NEXT: tbz w8, #7, .LBB0_8 +; CHECK-NEXT: .LBB0_40: // %cond.store13 +; CHECK-NEXT: mov b3, v0.b[7] +; CHECK-NEXT: stur b3, [x0, #7] +; CHECK-NEXT: tbz w8, #8, .LBB0_9 +; CHECK-NEXT: .LBB0_41: // %cond.store15 +; CHECK-NEXT: mov b3, v0.b[8] +; CHECK-NEXT: stur b3, [x0, #8] +; CHECK-NEXT: tbz w8, #9, .LBB0_10 +; CHECK-NEXT: .LBB0_42: // %cond.store17 +; CHECK-NEXT: mov b3, v0.b[9] +; CHECK-NEXT: stur b3, [x0, #9] +; CHECK-NEXT: tbz w8, #10, .LBB0_11 +; 
CHECK-NEXT: .LBB0_43: // %cond.store19 +; CHECK-NEXT: mov b3, v0.b[10] +; CHECK-NEXT: stur b3, [x0, #10] +; CHECK-NEXT: tbz w8, #11, .LBB0_12 +; CHECK-NEXT: .LBB0_44: // %cond.store21 +; CHECK-NEXT: mov b3, v0.b[11] +; CHECK-NEXT: stur b3, [x0, #11] +; CHECK-NEXT: tbz w8, #12, .LBB0_13 +; CHECK-NEXT: .LBB0_45: // %cond.store23 +; CHECK-NEXT: mov b3, v0.b[12] +; CHECK-NEXT: stur b3, [x0, #12] +; CHECK-NEXT: tbz w8, #13, .LBB0_14 +; CHECK-NEXT: .LBB0_46: // %cond.store25 +; CHECK-NEXT: mov b3, v0.b[13] +; CHECK-NEXT: stur b3, [x0, #13] +; CHECK-NEXT: tbz w8, #14, .LBB0_15 +; CHECK-NEXT: .LBB0_47: // %cond.store27 +; CHECK-NEXT: mov b3, v0.b[14] +; CHECK-NEXT: stur b3, [x0, #14] +; CHECK-NEXT: tbz w8, #15, .LBB0_16 +; CHECK-NEXT: .LBB0_48: // %cond.store29 +; CHECK-NEXT: mov b0, v0.b[15] +; CHECK-NEXT: stur b0, [x0, #15] +; CHECK-NEXT: zip2 v0.16b, v1.16b, v2.16b +; CHECK-NEXT: tbz w8, #16, .LBB0_17 +; CHECK-NEXT: .LBB0_49: // %cond.store31 +; CHECK-NEXT: stur b0, [x0, #16] +; CHECK-NEXT: tbz w8, #17, .LBB0_18 +; CHECK-NEXT: .LBB0_50: // %cond.store33 +; CHECK-NEXT: mov b1, v0.b[1] +; CHECK-NEXT: stur b1, [x0, #17] +; CHECK-NEXT: tbz w8, #18, .LBB0_19 +; CHECK-NEXT: .LBB0_51: // %cond.store35 +; CHECK-NEXT: mov b1, v0.b[2] +; CHECK-NEXT: stur b1, [x0, #18] +; CHECK-NEXT: tbz w8, #19, .LBB0_20 +; CHECK-NEXT: .LBB0_52: // %cond.store37 +; CHECK-NEXT: mov b1, v0.b[3] +; CHECK-NEXT: stur b1, [x0, #19] +; CHECK-NEXT: tbz w8, #20, .LBB0_21 +; CHECK-NEXT: .LBB0_53: // %cond.store39 +; CHECK-NEXT: mov b1, v0.b[4] +; CHECK-NEXT: stur b1, [x0, #20] +; CHECK-NEXT: tbz w8, #21, .LBB0_22 +; CHECK-NEXT: .LBB0_54: // %cond.store41 +; CHECK-NEXT: mov b1, v0.b[5] +; CHECK-NEXT: stur b1, [x0, #21] +; CHECK-NEXT: tbz w8, #22, .LBB0_23 +; CHECK-NEXT: .LBB0_55: // %cond.store43 +; CHECK-NEXT: mov b1, v0.b[6] +; CHECK-NEXT: stur b1, [x0, #22] +; CHECK-NEXT: tbz w8, #23, .LBB0_24 +; CHECK-NEXT: .LBB0_56: // %cond.store45 +; CHECK-NEXT: mov b1, v0.b[7] +; CHECK-NEXT: stur b1, [x0, #23] +; 
CHECK-NEXT: tbz w8, #24, .LBB0_25 +; CHECK-NEXT: .LBB0_57: // %cond.store47 +; CHECK-NEXT: mov b1, v0.b[8] +; CHECK-NEXT: stur b1, [x0, #24] +; CHECK-NEXT: tbz w8, #25, .LBB0_26 +; CHECK-NEXT: .LBB0_58: // %cond.store49 +; CHECK-NEXT: mov b1, v0.b[9] +; CHECK-NEXT: stur b1, [x0, #25] +; CHECK-NEXT: tbz w8, #26, .LBB0_27 +; CHECK-NEXT: .LBB0_59: // %cond.store51 +; CHECK-NEXT: mov b1, v0.b[10] +; CHECK-NEXT: stur b1, [x0, #26] +; CHECK-NEXT: tbz w8, #27, .LBB0_28 +; CHECK-NEXT: .LBB0_60: // %cond.store53 +; CHECK-NEXT: mov b1, v0.b[11] +; CHECK-NEXT: stur b1, [x0, #27] +; CHECK-NEXT: tbz w8, #28, .LBB0_29 +; CHECK-NEXT: .LBB0_61: // %cond.store55 +; CHECK-NEXT: mov b1, v0.b[12] +; CHECK-NEXT: stur b1, [x0, #28] +; CHECK-NEXT: tbz w8, #29, .LBB0_30 +; CHECK-NEXT: .LBB0_62: // %cond.store57 +; CHECK-NEXT: mov b1, v0.b[13] +; CHECK-NEXT: stur b1, [x0, #29] +; CHECK-NEXT: tbz w8, #30, .LBB0_31 +; CHECK-NEXT: .LBB0_63: // %cond.store59 +; CHECK-NEXT: mov b1, v0.b[14] +; CHECK-NEXT: stur b1, [x0, #30] +; CHECK-NEXT: tbz w8, #31, .LBB0_32 +; CHECK-NEXT: .LBB0_64: // %cond.store61 +; CHECK-NEXT: mov b0, v0.b[15] +; CHECK-NEXT: stur b0, [x0, #31] +; CHECK-NEXT: ret + %interleaved.mask = call <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1> %mask, <16 x i1> %mask) + %strided.vec = call <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8> %val1, <16 x i8> %val2) + call void @llvm.masked.store.v32i8.p0(<32 x i8> %strided.vec, ptr %p, i32 1, <32 x i1> %interleaved.mask) + ret void +} + +define void @foo_st2_v8i16(<8 x i1> %mask, <8 x i16> %val1, <8 x i16> %val2, ptr %p) { +; CHECK-LABEL: foo_st2_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: zip1 v0.16b, v0.16b, v0.16b +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v0.16b, 
v0.16b, v3.16b +; CHECK-NEXT: addv h3, v0.8h +; CHECK-NEXT: zip1 v0.8h, v1.8h, v2.8h +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: tbnz w8, #0, .LBB1_17 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB1_18 +; CHECK-NEXT: .LBB1_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB1_19 +; CHECK-NEXT: .LBB1_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB1_20 +; CHECK-NEXT: .LBB1_4: // %else6 +; CHECK-NEXT: tbnz w8, #4, .LBB1_21 +; CHECK-NEXT: .LBB1_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB1_22 +; CHECK-NEXT: .LBB1_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB1_23 +; CHECK-NEXT: .LBB1_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB1_24 +; CHECK-NEXT: .LBB1_8: // %else14 +; CHECK-NEXT: zip2 v0.8h, v1.8h, v2.8h +; CHECK-NEXT: tbnz w8, #8, .LBB1_25 +; CHECK-NEXT: .LBB1_9: // %else16 +; CHECK-NEXT: tbnz w8, #9, .LBB1_26 +; CHECK-NEXT: .LBB1_10: // %else18 +; CHECK-NEXT: tbnz w8, #10, .LBB1_27 +; CHECK-NEXT: .LBB1_11: // %else20 +; CHECK-NEXT: tbnz w8, #11, .LBB1_28 +; CHECK-NEXT: .LBB1_12: // %else22 +; CHECK-NEXT: tbnz w8, #12, .LBB1_29 +; CHECK-NEXT: .LBB1_13: // %else24 +; CHECK-NEXT: tbnz w8, #13, .LBB1_30 +; CHECK-NEXT: .LBB1_14: // %else26 +; CHECK-NEXT: tbnz w8, #14, .LBB1_31 +; CHECK-NEXT: .LBB1_15: // %else28 +; CHECK-NEXT: tbnz w8, #15, .LBB1_32 +; CHECK-NEXT: .LBB1_16: // %else30 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_17: // %cond.store +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: tbz w8, #1, .LBB1_2 +; CHECK-NEXT: .LBB1_18: // %cond.store1 +; CHECK-NEXT: mov h3, v0.h[1] +; CHECK-NEXT: str h3, [x0, #2] +; CHECK-NEXT: tbz w8, #2, .LBB1_3 +; CHECK-NEXT: .LBB1_19: // %cond.store3 +; CHECK-NEXT: mov h3, v0.h[2] +; CHECK-NEXT: str h3, [x0, #4] +; CHECK-NEXT: tbz w8, #3, .LBB1_4 +; CHECK-NEXT: .LBB1_20: // %cond.store5 +; CHECK-NEXT: mov h3, v0.h[3] +; CHECK-NEXT: str h3, [x0, #6] +; CHECK-NEXT: tbz w8, #4, .LBB1_5 +; CHECK-NEXT: .LBB1_21: // %cond.store7 +; CHECK-NEXT: mov h3, v0.h[4] +; CHECK-NEXT: str h3, [x0, #8] +; CHECK-NEXT: tbz w8, #5, .LBB1_6 +; 
CHECK-NEXT: .LBB1_22: // %cond.store9 +; CHECK-NEXT: mov h3, v0.h[5] +; CHECK-NEXT: str h3, [x0, #10] +; CHECK-NEXT: tbz w8, #6, .LBB1_7 +; CHECK-NEXT: .LBB1_23: // %cond.store11 +; CHECK-NEXT: mov h3, v0.h[6] +; CHECK-NEXT: str h3, [x0, #12] +; CHECK-NEXT: tbz w8, #7, .LBB1_8 +; CHECK-NEXT: .LBB1_24: // %cond.store13 +; CHECK-NEXT: mov h0, v0.h[7] +; CHECK-NEXT: str h0, [x0, #14] +; CHECK-NEXT: zip2 v0.8h, v1.8h, v2.8h +; CHECK-NEXT: tbz w8, #8, .LBB1_9 +; CHECK-NEXT: .LBB1_25: // %cond.store15 +; CHECK-NEXT: str h0, [x0, #16] +; CHECK-NEXT: tbz w8, #9, .LBB1_10 +; CHECK-NEXT: .LBB1_26: // %cond.store17 +; CHECK-NEXT: mov h1, v0.h[1] +; CHECK-NEXT: str h1, [x0, #18] +; CHECK-NEXT: tbz w8, #10, .LBB1_11 +; CHECK-NEXT: .LBB1_27: // %cond.store19 +; CHECK-NEXT: mov h1, v0.h[2] +; CHECK-NEXT: str h1, [x0, #20] +; CHECK-NEXT: tbz w8, #11, .LBB1_12 +; CHECK-NEXT: .LBB1_28: // %cond.store21 +; CHECK-NEXT: mov h1, v0.h[3] +; CHECK-NEXT: str h1, [x0, #22] +; CHECK-NEXT: tbz w8, #12, .LBB1_13 +; CHECK-NEXT: .LBB1_29: // %cond.store23 +; CHECK-NEXT: mov h1, v0.h[4] +; CHECK-NEXT: str h1, [x0, #24] +; CHECK-NEXT: tbz w8, #13, .LBB1_14 +; CHECK-NEXT: .LBB1_30: // %cond.store25 +; CHECK-NEXT: mov h1, v0.h[5] +; CHECK-NEXT: str h1, [x0, #26] +; CHECK-NEXT: tbz w8, #14, .LBB1_15 +; CHECK-NEXT: .LBB1_31: // %cond.store27 +; CHECK-NEXT: mov h1, v0.h[6] +; CHECK-NEXT: str h1, [x0, #28] +; CHECK-NEXT: tbz w8, #15, .LBB1_16 +; CHECK-NEXT: .LBB1_32: // %cond.store29 +; CHECK-NEXT: mov h0, v0.h[7] +; CHECK-NEXT: str h0, [x0, #30] +; CHECK-NEXT: ret + %interleaved.mask = call <16 x i1> @llvm.vector.interleave2.v16i1(<8 x i1> %mask, <8 x i1> %mask) + %strided.vec = call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> %val1, <8 x i16> %val2) + call void @llvm.masked.store.v16i16.p0(<16 x i16> %strided.vec, ptr %p, i32 1, <16 x i1> %interleaved.mask) + ret void +} + +define void @foo_st2_v4i32(<4 x i1> %mask, <4 x i32> %val1, <4 x i32> %val2, ptr %p) { +; CHECK-LABEL: foo_st2_v4i32: +; 
CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-NEXT: shl v0.8b, v0.8b, #7 +; CHECK-NEXT: cmlt v0.8b, v0.8b, #0 +; CHECK-NEXT: and v0.8b, v0.8b, v3.8b +; CHECK-NEXT: addv b3, v0.8b +; CHECK-NEXT: zip1 v0.4s, v1.4s, v2.4s +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: tbnz w8, #0, .LBB2_9 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB2_10 +; CHECK-NEXT: .LBB2_2: // %else2 +; CHECK-NEXT: tbnz w8, #2, .LBB2_11 +; CHECK-NEXT: .LBB2_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB2_12 +; CHECK-NEXT: .LBB2_4: // %else6 +; CHECK-NEXT: zip2 v0.4s, v1.4s, v2.4s +; CHECK-NEXT: tbnz w8, #4, .LBB2_13 +; CHECK-NEXT: .LBB2_5: // %else8 +; CHECK-NEXT: tbnz w8, #5, .LBB2_14 +; CHECK-NEXT: .LBB2_6: // %else10 +; CHECK-NEXT: tbnz w8, #6, .LBB2_15 +; CHECK-NEXT: .LBB2_7: // %else12 +; CHECK-NEXT: tbnz w8, #7, .LBB2_16 +; CHECK-NEXT: .LBB2_8: // %else14 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB2_9: // %cond.store +; CHECK-NEXT: str s0, [x0] +; CHECK-NEXT: tbz w8, #1, .LBB2_2 +; CHECK-NEXT: .LBB2_10: // %cond.store1 +; CHECK-NEXT: mov s3, v0.s[1] +; CHECK-NEXT: str s3, [x0, #4] +; CHECK-NEXT: tbz w8, #2, .LBB2_3 +; CHECK-NEXT: .LBB2_11: // %cond.store3 +; CHECK-NEXT: mov s3, v0.s[2] +; CHECK-NEXT: str s3, [x0, #8] +; CHECK-NEXT: tbz w8, #3, .LBB2_4 +; CHECK-NEXT: .LBB2_12: // %cond.store5 +; CHECK-NEXT: mov s0, v0.s[3] +; CHECK-NEXT: str s0, [x0, #12] +; CHECK-NEXT: zip2 v0.4s, v1.4s, v2.4s +; CHECK-NEXT: tbz w8, #4, .LBB2_5 +; CHECK-NEXT: .LBB2_13: // %cond.store7 +; CHECK-NEXT: str s0, [x0, #16] +; CHECK-NEXT: tbz w8, #5, .LBB2_6 +; CHECK-NEXT: .LBB2_14: // %cond.store9 +; CHECK-NEXT: mov s1, v0.s[1] +; CHECK-NEXT: str s1, [x0, #20] +; CHECK-NEXT: tbz w8, #6, .LBB2_7 +; CHECK-NEXT: .LBB2_15: // %cond.store11 +; CHECK-NEXT: mov s1, v0.s[2] +; CHECK-NEXT: str s1, [x0, #24] +; CHECK-NEXT: tbz w8, #7, .LBB2_8 +; CHECK-NEXT: .LBB2_16: // 
%cond.store13 +; CHECK-NEXT: mov s0, v0.s[3] +; CHECK-NEXT: str s0, [x0, #28] +; CHECK-NEXT: ret + %interleaved.mask = call <8 x i1> @llvm.vector.interleave2.v8i1(<4 x i1> %mask, <4 x i1> %mask) + %strided.vec = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %val1, <4 x i32> %val2) + call void @llvm.masked.store.v8i32.p0(<8 x i32> %strided.vec, ptr %p, i32 1, <8 x i1> %interleaved.mask) + ret void +} + +define void @foo_st2_v2i64(<2 x i1> %mask, <2 x i64> %val1, <2 x i64> %val2, ptr %p) { +; CHECK-LABEL: foo_st2_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: zip1 v0.4h, v0.4h, v0.4h +; CHECK-NEXT: shl v0.4h, v0.4h, #15 +; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 +; CHECK-NEXT: and v0.8b, v0.8b, v3.8b +; CHECK-NEXT: addv h3, v0.4h +; CHECK-NEXT: zip1 v0.2d, v1.2d, v2.2d +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: tbnz w8, #0, .LBB3_5 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: tbnz w8, #1, .LBB3_6 +; CHECK-NEXT: .LBB3_2: // %else2 +; CHECK-NEXT: zip2 v0.2d, v1.2d, v2.2d +; CHECK-NEXT: tbnz w8, #2, .LBB3_7 +; CHECK-NEXT: .LBB3_3: // %else4 +; CHECK-NEXT: tbnz w8, #3, .LBB3_8 +; CHECK-NEXT: .LBB3_4: // %else6 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB3_5: // %cond.store +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: tbz w8, #1, .LBB3_2 +; CHECK-NEXT: .LBB3_6: // %cond.store1 +; CHECK-NEXT: mov d0, v0.d[1] +; CHECK-NEXT: str d0, [x0, #8] +; CHECK-NEXT: zip2 v0.2d, v1.2d, v2.2d +; CHECK-NEXT: tbz w8, #2, .LBB3_3 +; CHECK-NEXT: .LBB3_7: // %cond.store3 +; CHECK-NEXT: str d0, [x0, #16] +; CHECK-NEXT: tbz w8, #3, .LBB3_4 +; CHECK-NEXT: .LBB3_8: // %cond.store5 +; CHECK-NEXT: mov d0, v0.d[1] +; CHECK-NEXT: str d0, [x0, #24] +; CHECK-NEXT: ret + %interleaved.mask = call <4 x i1> @llvm.vector.interleave2.v4i1(<2 x i1> %mask, <2 x i1> %mask) + %strided.vec = call <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64> %val1, <2 x i64> %val2) + call void 
@llvm.masked.store.v4i64.p0(<4 x i64> %strided.vec, ptr %p, i32 1, <4 x i1> %interleaved.mask) + ret void +} diff --git a/llvm/test/CodeGen/AArch64/fp16_i16_intrinsic_scalar.ll b/llvm/test/CodeGen/AArch64/fp16_i16_intrinsic_scalar.ll new file mode 100644 index 000000000000..ab502508fadb --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fp16_i16_intrinsic_scalar.ll @@ -0,0 +1,128 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=aarch64 -global-isel=0 -mattr=+v8.2a,+fullfp16 | FileCheck %s + +; Test f16 -> i16 NEON intrinsics, currently only supported in SDAG. +; Should be merged with fp16_intrinsic_scalar_1op.ll once there is +; support in GISel. + +declare i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half) +declare i16 @llvm.aarch64.neon.fcvtzu.i16.f16(half) +declare i16 @llvm.aarch64.neon.fcvtas.i16.f16(half) +declare i16 @llvm.aarch64.neon.fcvtau.i16.f16(half) +declare i16 @llvm.aarch64.neon.fcvtms.i16.f16(half) +declare i16 @llvm.aarch64.neon.fcvtmu.i16.f16(half) +declare i16 @llvm.aarch64.neon.fcvtns.i16.f16(half) +declare i16 @llvm.aarch64.neon.fcvtnu.i16.f16(half) +declare i16 @llvm.aarch64.neon.fcvtps.i16.f16(half) +declare i16 @llvm.aarch64.neon.fcvtpu.i16.f16(half) + + +define i16 @fcvtzs_intrinsic_i16(half %a) { +; CHECK-LABEL: fcvtzs_intrinsic_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs h0, h0 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %fcvt = tail call i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half %a) + ret i16 %fcvt +} + +define i16 @fcvtzu_intrinsic_i16(half %a) { +; CHECK-LABEL: fcvtzu_intrinsic_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu h0, h0 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %fcvt = tail call i16 @llvm.aarch64.neon.fcvtzu.i16.f16(half %a) + ret i16 %fcvt +} + +define i16 @fcvtas_intrinsic_i16(half %a) { +; CHECK-LABEL: fcvtas_intrinsic_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas h0, h0 +; CHECK-NEXT: fmov w0, 
s0 +; CHECK-NEXT: ret +entry: + %fcvt = tail call i16 @llvm.aarch64.neon.fcvtas.i16.f16(half %a) + ret i16 %fcvt +} + +define i16 @fcvtau_intrinsic_i16(half %a) { +; CHECK-LABEL: fcvtau_intrinsic_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtau h0, h0 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %fcvt = tail call i16 @llvm.aarch64.neon.fcvtau.i16.f16(half %a) + ret i16 %fcvt +} + +define i16 @fcvtms_intrinsic_i16(half %a) { +; CHECK-LABEL: fcvtms_intrinsic_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtms h0, h0 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %fcvt = tail call i16 @llvm.aarch64.neon.fcvtms.i16.f16(half %a) + ret i16 %fcvt +} + +define i16 @fcvtmu_intrinsic_i16(half %a) { +; CHECK-LABEL: fcvtmu_intrinsic_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtmu h0, h0 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %fcvt = tail call i16 @llvm.aarch64.neon.fcvtmu.i16.f16(half %a) + ret i16 %fcvt +} + +define i16 @fcvtns_intrinsic_i16(half %a) { +; CHECK-LABEL: fcvtns_intrinsic_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtns h0, h0 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %fcvt = tail call i16 @llvm.aarch64.neon.fcvtns.i16.f16(half %a) + ret i16 %fcvt +} + +define i16 @fcvtnu_intrinsic_i16(half %a) { +; CHECK-LABEL: fcvtnu_intrinsic_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtnu h0, h0 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %fcvt = tail call i16 @llvm.aarch64.neon.fcvtnu.i16.f16(half %a) + ret i16 %fcvt +} + +define i16 @fcvtps_intrinsic_i16(half %a) { +; CHECK-LABEL: fcvtps_intrinsic_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtps h0, h0 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret +entry: + %fcvt = tail call i16 @llvm.aarch64.neon.fcvtps.i16.f16(half %a) + ret i16 %fcvt +} + +define i16 @fcvtpu_intrinsic_i16(half %a) { +; CHECK-LABEL: fcvtpu_intrinsic_i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtpu h0, h0 +; CHECK-NEXT: fmov w0, s0 +; 
CHECK-NEXT: ret +entry: + %fcvt = tail call i16 @llvm.aarch64.neon.fcvtpu.i16.f16(half %a) + ret i16 %fcvt +} diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_1op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_1op.ll index 58cbc2953dbc..b4fc8971ede8 100644 --- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_1op.ll +++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_1op.ll @@ -1,13 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=aarch64 -mattr=+v8.2a,+fullfp16 | FileCheck %s declare <4 x half> @llvm.nearbyint.v4f16(<4 x half>) declare <8 x half> @llvm.nearbyint.v8f16(<8 x half>) declare <4 x half> @llvm.sqrt.v4f16(<4 x half>) declare <8 x half> @llvm.sqrt.v8f16(<8 x half>) +declare <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half>) +declare <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half>) +declare <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half>) +declare <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half>) +declare <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half>) +declare <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half>) +declare <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half>) +declare <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half>) +declare <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half>) +declare <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half>) define dso_local <4 x half> @t_vrndi_f16(<4 x half> %a) { ; CHECK-LABEL: t_vrndi_f16: -; CHECK: frinti v0.4h, v0.4h +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinti v0.4h, v0.4h ; CHECK-NEXT: ret entry: %vrndi1.i = tail call <4 x half> @llvm.nearbyint.v4f16(<4 x half> %a) @@ -16,7 +28,8 @@ entry: define dso_local <8 x half> @t_vrndiq_f16(<8 x half> %a) { ; CHECK-LABEL: t_vrndiq_f16: -; CHECK: frinti v0.8h, v0.8h +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinti v0.8h, v0.8h ; CHECK-NEXT: ret entry: %vrndi1.i 
= tail call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %a) @@ -25,7 +38,8 @@ entry: define dso_local <4 x half> @t_vsqrt_f16(<4 x half> %a) { ; CHECK-LABEL: t_vsqrt_f16: -; CHECK: fsqrt v0.4h, v0.4h +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fsqrt v0.4h, v0.4h ; CHECK-NEXT: ret entry: %vsqrt.i = tail call <4 x half> @llvm.sqrt.v4f16(<4 x half> %a) @@ -34,9 +48,110 @@ entry: define dso_local <8 x half> @t_vsqrtq_f16(<8 x half> %a) { ; CHECK-LABEL: t_vsqrtq_f16: -; CHECK: fsqrt v0.8h, v0.8h +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fsqrt v0.8h, v0.8h ; CHECK-NEXT: ret entry: %vsqrt.i = tail call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a) ret <8 x half> %vsqrt.i } + +define <4 x i16> @t_fcvtzs_v4i16_v4f16(<4 x half> %a) { +; CHECK-LABEL: t_fcvtzs_v4i16_v4f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs v0.4h, v0.4h +; CHECK-NEXT: ret +entry: + %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half> %a) + ret <4 x i16> %vcvt +} + +define <4 x i16> @t_fcvtzu_v4i16_v4f16(<4 x half> %a) { +; CHECK-LABEL: t_fcvtzu_v4i16_v4f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu v0.4h, v0.4h +; CHECK-NEXT: ret +entry: + %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> %a) + ret <4 x i16> %vcvt +} + +define <4 x i16> @t_fcvtas_v4i16_v4f16(<4 x half> %a) { +; CHECK-LABEL: t_fcvtas_v4i16_v4f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas v0.4h, v0.4h +; CHECK-NEXT: ret +entry: + %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half> %a) + ret <4 x i16> %vcvt +} + +define <4 x i16> @t_fcvtau_v4i16_v4f16(<4 x half> %a) { +; CHECK-LABEL: t_fcvtau_v4i16_v4f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtau v0.4h, v0.4h +; CHECK-NEXT: ret +entry: + %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half> %a) + ret <4 x i16> %vcvt +} + +define <4 x i16> @t_fcvtms_v4i16_v4f16(<4 x half> %a) { +; CHECK-LABEL: t_fcvtms_v4i16_v4f16: +; CHECK: // %bb.0: // %entry +; 
CHECK-NEXT: fcvtms v0.4h, v0.4h +; CHECK-NEXT: ret +entry: + %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half> %a) + ret <4 x i16> %vcvt +} + +define <4 x i16> @t_fcvtmu_v4i16_v4f16(<4 x half> %a) { +; CHECK-LABEL: t_fcvtmu_v4i16_v4f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtmu v0.4h, v0.4h +; CHECK-NEXT: ret +entry: + %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half> %a) + ret <4 x i16> %vcvt +} + +define <4 x i16> @t_fcvtns_v4i16_v4f16(<4 x half> %a) { +; CHECK-LABEL: t_fcvtns_v4i16_v4f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtns v0.4h, v0.4h +; CHECK-NEXT: ret +entry: + %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half> %a) + ret <4 x i16> %vcvt +} + +define <4 x i16> @t_fcvtnu_v4i16_v4f16(<4 x half> %a) { +; CHECK-LABEL: t_fcvtnu_v4i16_v4f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtnu v0.4h, v0.4h +; CHECK-NEXT: ret +entry: + %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half> %a) + ret <4 x i16> %vcvt +} + +define <4 x i16> @t_fcvtps_v4i16_v4f16(<4 x half> %a) { +; CHECK-LABEL: t_fcvtps_v4i16_v4f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtps v0.4h, v0.4h +; CHECK-NEXT: ret +entry: + %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half> %a) + ret <4 x i16> %vcvt +} + +define <4 x i16> @t_fcvtpu_v4i16_v4f16(<4 x half> %a) { +; CHECK-LABEL: t_fcvtpu_v4i16_v4f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtpu v0.4h, v0.4h +; CHECK-NEXT: ret +entry: + %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half> %a) + ret <4 x i16> %vcvt +} diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll index e3aef487890f..ddee23cc3fc5 100644 --- a/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll @@ -670,13 +670,9 @@ define i16 @test_signed_i16_f16(half %f) nounwind { ; ; 
CHECK-SD-FP16-LABEL: test_signed_i16_f16: ; CHECK-SD-FP16: // %bb.0: -; CHECK-SD-FP16-NEXT: fcvtzs w8, h0 -; CHECK-SD-FP16-NEXT: mov w9, #32767 // =0x7fff -; CHECK-SD-FP16-NEXT: cmp w8, w9 -; CHECK-SD-FP16-NEXT: csel w8, w8, w9, lt -; CHECK-SD-FP16-NEXT: mov w9, #-32768 // =0xffff8000 -; CHECK-SD-FP16-NEXT: cmn w8, #8, lsl #12 // =32768 -; CHECK-SD-FP16-NEXT: csel w0, w8, w9, gt +; CHECK-SD-FP16-NEXT: fcvtzs h0, h0 +; CHECK-SD-FP16-NEXT: fmov w8, s0 +; CHECK-SD-FP16-NEXT: sxth w0, w8 ; CHECK-SD-FP16-NEXT: ret ; ; CHECK-GI-CVT-LABEL: test_signed_i16_f16: @@ -693,13 +689,8 @@ define i16 @test_signed_i16_f16(half %f) nounwind { ; ; CHECK-GI-FP16-LABEL: test_signed_i16_f16: ; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: fcvtzs w8, h0 -; CHECK-GI-FP16-NEXT: mov w9, #32767 // =0x7fff -; CHECK-GI-FP16-NEXT: cmp w8, w9 -; CHECK-GI-FP16-NEXT: csel w8, w8, w9, lt -; CHECK-GI-FP16-NEXT: mov w9, #-32768 // =0xffff8000 -; CHECK-GI-FP16-NEXT: cmn w8, #8, lsl #12 // =32768 -; CHECK-GI-FP16-NEXT: csel w0, w8, w9, gt +; CHECK-GI-FP16-NEXT: fcvtzs h0, h0 +; CHECK-GI-FP16-NEXT: fmov w0, s0 ; CHECK-GI-FP16-NEXT: ret %x = call i16 @llvm.fptosi.sat.i16.f16(half %f) ret i16 %x diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll index 07e49e331415..8abad4419663 100644 --- a/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll @@ -531,10 +531,8 @@ define i16 @test_unsigned_i16_f16(half %f) nounwind { ; ; CHECK-SD-FP16-LABEL: test_unsigned_i16_f16: ; CHECK-SD-FP16: // %bb.0: -; CHECK-SD-FP16-NEXT: fcvtzu w8, h0 -; CHECK-SD-FP16-NEXT: mov w9, #65535 // =0xffff -; CHECK-SD-FP16-NEXT: cmp w8, w9 -; CHECK-SD-FP16-NEXT: csel w0, w8, w9, lo +; CHECK-SD-FP16-NEXT: fcvtzu h0, h0 +; CHECK-SD-FP16-NEXT: fmov w0, s0 ; CHECK-SD-FP16-NEXT: ret ; ; CHECK-GI-CVT-LABEL: test_unsigned_i16_f16: @@ -548,10 +546,8 @@ define i16 @test_unsigned_i16_f16(half %f) nounwind { ; ; CHECK-GI-FP16-LABEL: 
test_unsigned_i16_f16: ; CHECK-GI-FP16: // %bb.0: -; CHECK-GI-FP16-NEXT: fcvtzu w8, h0 -; CHECK-GI-FP16-NEXT: mov w9, #65535 // =0xffff -; CHECK-GI-FP16-NEXT: cmp w8, w9 -; CHECK-GI-FP16-NEXT: csel w0, w8, w9, lo +; CHECK-GI-FP16-NEXT: fcvtzu h0, h0 +; CHECK-GI-FP16-NEXT: fmov w0, s0 ; CHECK-GI-FP16-NEXT: ret %x = call i16 @llvm.fptoui.sat.i16.f16(half %f) ret i16 %x diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll index 36d88c552493..7a9d6b7e5245 100644 --- a/llvm/test/CodeGen/AArch64/freeze.ll +++ b/llvm/test/CodeGen/AArch64/freeze.ll @@ -430,6 +430,76 @@ define <8 x i16> @freeze_abds(<8 x i16> %a, <8 x i16> %b) { ret <8 x i16> %r } +define <8 x i16> @freeze_uhadd(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: freeze_uhadd: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.8h, #15 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %m0 = and <8 x i16> %a0, splat (i16 15) + %m1 = and <8 x i16> %a1, splat (i16 15) + %avg = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %m0, <8 x i16> %m1) + %frozen = freeze <8 x i16> %avg + %masked = and <8 x i16> %frozen, splat (i16 31) + ret <8 x i16> %masked +} + +define <8 x i16> @freeze_urhadd(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: freeze_urhadd: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.8h, #15 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: urhadd v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %m0 = and <8 x i16> %a0, splat (i16 15) + %m1 = and <8 x i16> %a1, splat (i16 15) + %avg = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %m0, <8 x i16> %m1) + %frozen = freeze <8 x i16> %avg + %masked = and <8 x i16> %frozen, splat (i16 31) + ret <8 x i16> %masked +} + +; TODO: Unnecessary sext_inreg +define <8 x i16> @freeze_shadd(<8 x i8> %a0, <8 x i16> %a1) { +; CHECK-LABEL: freeze_shadd: +; CHECK: // %bb.0: +; CHECK-NEXT: sshll v0.8h, 
v0.8b, #0 +; CHECK-NEXT: sshr v1.8h, v1.8h, #8 +; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h +; CHECK-NEXT: shl v0.8h, v0.8h, #8 +; CHECK-NEXT: sshr v0.8h, v0.8h, #8 +; CHECK-NEXT: ret + %x0 = sext <8 x i8> %a0 to <8 x i16> + %x1 = ashr <8 x i16> %a1, splat (i16 8) + %avg = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %x0, <8 x i16> %x1) + %frozen = freeze <8 x i16> %avg + %trunc = trunc <8 x i16> %frozen to <8 x i8> + %sext = sext <8 x i8> %trunc to <8 x i16> + ret <8 x i16> %sext +} + +; TODO: Unnecessary sext_inreg +define <8 x i16> @freeze_srhadd(<8 x i8> %a0, <8 x i16> %a1) { +; CHECK-LABEL: freeze_srhadd: +; CHECK: // %bb.0: +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshr v1.8h, v1.8h, #8 +; CHECK-NEXT: srhadd v0.8h, v0.8h, v1.8h +; CHECK-NEXT: shl v0.8h, v0.8h, #8 +; CHECK-NEXT: sshr v0.8h, v0.8h, #8 +; CHECK-NEXT: ret + %x0 = sext <8 x i8> %a0 to <8 x i16> + %x1 = ashr <8 x i16> %a1, splat (i16 8) + %avg = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %x0, <8 x i16> %x1) + %frozen = freeze <8 x i16> %avg + %trunc = trunc <8 x i16> %frozen to <8 x i8> + %sext = sext <8 x i8> %trunc to <8 x i16> + ret <8 x i16> %sext +} + define i32 @freeze_scmp(i32 %a0) nounwind { ; CHECK-LABEL: freeze_scmp: ; CHECK: // %bb.0: diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll index ae2ef2649102..765f6b77b41a 100644 --- a/llvm/test/CodeGen/AArch64/fsh.ll +++ b/llvm/test/CodeGen/AArch64/fsh.ll @@ -1379,7 +1379,7 @@ define <7 x i32> @rotl_v7i32(<7 x i32> %a, <7 x i32> %c) { ; CHECK-GI-LABEL: rotl_v7i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr s0, [sp, #24] -; CHECK-GI-NEXT: fmov s1, wzr +; CHECK-GI-NEXT: movi d1, #0000000000000000 ; CHECK-GI-NEXT: fmov s3, w7 ; CHECK-GI-NEXT: ldr s2, [sp, #32] ; CHECK-GI-NEXT: mov x8, sp @@ -1387,31 +1387,32 @@ define <7 x i32> @rotl_v7i32(<7 x i32> %a, <7 x i32> %c) { ; CHECK-GI-NEXT: mov v6.16b, v0.16b ; CHECK-GI-NEXT: ldr s7, [sp] ; CHECK-GI-NEXT: ldr s5, [sp, #40] -; 
CHECK-GI-NEXT: mov v1.s[1], wzr ; CHECK-GI-NEXT: ld1 { v3.s }[1], [x8] ; CHECK-GI-NEXT: add x8, sp, #8 +; CHECK-GI-NEXT: fmov s16, w0 +; CHECK-GI-NEXT: mov v1.s[1], wzr ; CHECK-GI-NEXT: mov v4.s[1], v7.s[0] ; CHECK-GI-NEXT: ldr s7, [sp, #8] -; CHECK-GI-NEXT: fmov s16, w0 ; CHECK-GI-NEXT: mov v6.s[1], v2.s[0] ; CHECK-GI-NEXT: fmov s17, w0 ; CHECK-GI-NEXT: add x9, sp, #16 ; CHECK-GI-NEXT: ld1 { v3.s }[2], [x8] ; CHECK-GI-NEXT: mov w8, #31 // =0x1f +; CHECK-GI-NEXT: mov v16.s[1], w1 +; CHECK-GI-NEXT: fmov s18, w8 ; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] +; CHECK-GI-NEXT: fmov s2, w4 ; CHECK-GI-NEXT: mov v1.s[2], wzr -; CHECK-GI-NEXT: fmov s18, w8 -; CHECK-GI-NEXT: mov v16.s[1], w1 ; CHECK-GI-NEXT: mov v4.s[2], v7.s[0] ; CHECK-GI-NEXT: ldr s7, [sp, #16] -; CHECK-GI-NEXT: mov v17.s[1], w1 ; CHECK-GI-NEXT: mov v6.s[2], v5.s[0] ; CHECK-GI-NEXT: ld1 { v3.s }[3], [x9] -; CHECK-GI-NEXT: fmov s2, w4 +; CHECK-GI-NEXT: mov v17.s[1], w1 ; CHECK-GI-NEXT: mov v18.s[1], w8 ; CHECK-GI-NEXT: movi v19.4s, #31 -; CHECK-GI-NEXT: mov v0.s[2], v5.s[0] ; CHECK-GI-NEXT: mov v16.s[2], w2 +; CHECK-GI-NEXT: mov v2.s[1], w5 +; CHECK-GI-NEXT: mov v0.s[2], v5.s[0] ; CHECK-GI-NEXT: mov v4.s[3], v7.s[0] ; CHECK-GI-NEXT: fmov s7, w4 ; CHECK-GI-NEXT: neg v3.4s, v3.4s @@ -1419,15 +1420,14 @@ define <7 x i32> @rotl_v7i32(<7 x i32> %a, <7 x i32> %c) { ; CHECK-GI-NEXT: fmov s6, w8 ; CHECK-GI-NEXT: mov v17.s[2], w2 ; CHECK-GI-NEXT: mov v18.s[2], w8 -; CHECK-GI-NEXT: mov v2.s[1], w5 +; CHECK-GI-NEXT: mov v16.s[3], w3 ; CHECK-GI-NEXT: mov v7.s[1], w5 ; CHECK-GI-NEXT: and v3.16b, v3.16b, v19.16b -; CHECK-GI-NEXT: mov v16.s[3], w3 +; CHECK-GI-NEXT: mov v2.s[2], w6 ; CHECK-GI-NEXT: mov v6.s[1], w8 ; CHECK-GI-NEXT: and v4.16b, v4.16b, v19.16b ; CHECK-GI-NEXT: mov v17.s[3], w3 ; CHECK-GI-NEXT: and v1.16b, v1.16b, v18.16b -; CHECK-GI-NEXT: mov v2.s[2], w6 ; CHECK-GI-NEXT: neg v3.4s, v3.4s ; CHECK-GI-NEXT: mov v7.s[2], w6 ; CHECK-GI-NEXT: mov v6.s[2], w8 @@ -1510,7 +1510,7 @@ define <7 x i32> @rotr_v7i32(<7 x i32> 
%a, <7 x i32> %c) { ; CHECK-GI-NEXT: fmov s2, w7 ; CHECK-GI-NEXT: mov x8, sp ; CHECK-GI-NEXT: ldr s6, [sp, #8] -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: ldr s7, [sp, #32] ; CHECK-GI-NEXT: fmov s16, w0 ; CHECK-GI-NEXT: fmov s17, w0 @@ -1518,12 +1518,12 @@ define <7 x i32> @rotr_v7i32(<7 x i32> %a, <7 x i32> %c) { ; CHECK-GI-NEXT: ldr s3, [sp, #24] ; CHECK-GI-NEXT: ld1 { v2.s }[1], [x8] ; CHECK-GI-NEXT: mov w8, #31 // =0x1f -; CHECK-GI-NEXT: mov v0.s[1], wzr ; CHECK-GI-NEXT: add x9, sp, #8 +; CHECK-GI-NEXT: ldr s5, [sp, #40] ; CHECK-GI-NEXT: mov v4.16b, v3.16b ; CHECK-GI-NEXT: mov v3.s[1], v7.s[0] +; CHECK-GI-NEXT: mov v0.s[1], wzr ; CHECK-GI-NEXT: fmov s18, w8 -; CHECK-GI-NEXT: ldr s5, [sp, #40] ; CHECK-GI-NEXT: ld1 { v2.s }[2], [x9] ; CHECK-GI-NEXT: mov v17.s[1], w1 ; CHECK-GI-NEXT: mov v1.s[2], v6.s[0] @@ -1531,9 +1531,9 @@ define <7 x i32> @rotr_v7i32(<7 x i32> %a, <7 x i32> %c) { ; CHECK-GI-NEXT: mov v16.s[1], w1 ; CHECK-GI-NEXT: mov v4.s[1], v7.s[0] ; CHECK-GI-NEXT: ldr s7, [sp, #16] +; CHECK-GI-NEXT: fmov s19, w4 ; CHECK-GI-NEXT: mov v18.s[1], w8 ; CHECK-GI-NEXT: mov v3.s[2], v5.s[0] -; CHECK-GI-NEXT: fmov s19, w4 ; CHECK-GI-NEXT: add x10, sp, #16 ; CHECK-GI-NEXT: mov v6.s[1], w8 ; CHECK-GI-NEXT: mov v0.s[2], wzr diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll index e5aa360f804c..f9fd2ad1b5b6 100644 --- a/llvm/test/CodeGen/AArch64/funnel-shift.ll +++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll @@ -674,14 +674,12 @@ define i32 @or_shl_fshl_simplify(i32 %x, i32 %y, i32 %s) { ; CHECK-GI-LABEL: or_shl_fshl_simplify: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: mov w8, #31 // =0x1f -; CHECK-GI-NEXT: and w9, w2, #0x1f -; CHECK-GI-NEXT: lsr w10, w0, #1 -; CHECK-GI-NEXT: lsl w11, w1, w2 +; CHECK-GI-NEXT: lsr w9, w0, #1 +; CHECK-GI-NEXT: and w10, w2, #0x1f ; CHECK-GI-NEXT: bic w8, w8, w2 -; CHECK-GI-NEXT: lsl w9, w1, w9 -; CHECK-GI-NEXT: lsr w8, w10, w8 -; CHECK-GI-NEXT: orr 
w9, w9, w11 -; CHECK-GI-NEXT: orr w0, w9, w8 +; CHECK-GI-NEXT: lsl w10, w1, w10 +; CHECK-GI-NEXT: lsr w8, w9, w8 +; CHECK-GI-NEXT: orr w0, w10, w8 ; CHECK-GI-NEXT: ret %shy = shl i32 %y, %s %fun = call i32 @llvm.fshl.i32(i32 %y, i32 %x, i32 %s) @@ -702,14 +700,12 @@ define i32 @or_lshr_fshr_simplify(i32 %x, i32 %y, i32 %s) { ; CHECK-GI-LABEL: or_lshr_fshr_simplify: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: mov w8, #31 // =0x1f -; CHECK-GI-NEXT: and w9, w2, #0x1f -; CHECK-GI-NEXT: lsl w10, w0, #1 -; CHECK-GI-NEXT: lsr w11, w1, w2 +; CHECK-GI-NEXT: lsl w9, w0, #1 +; CHECK-GI-NEXT: and w10, w2, #0x1f ; CHECK-GI-NEXT: bic w8, w8, w2 -; CHECK-GI-NEXT: lsr w9, w1, w9 -; CHECK-GI-NEXT: lsl w8, w10, w8 -; CHECK-GI-NEXT: orr w9, w11, w9 -; CHECK-GI-NEXT: orr w0, w9, w8 +; CHECK-GI-NEXT: lsl w8, w9, w8 +; CHECK-GI-NEXT: lsr w9, w1, w10 +; CHECK-GI-NEXT: orr w0, w8, w9 ; CHECK-GI-NEXT: ret %shy = lshr i32 %y, %s %fun = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %s) diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll index 13a43d6d3523..a9167ad6ebb7 100644 --- a/llvm/test/CodeGen/AArch64/insertextract.ll +++ b/llvm/test/CodeGen/AArch64/insertextract.ll @@ -1,17 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for insert_v2i128_0 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2i128_1 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2i128_c -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2fp128_0 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for 
insert_v2fp128_1 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2fp128_c -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v2i128_0 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v2i128_1 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v2i128_c -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v2fp128_c +; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define <2 x double> @insert_v2f64_0(<2 x double> %a, double %b, i32 %c) { ; CHECK-LABEL: insert_v2f64_0: @@ -1324,13 +1313,21 @@ entry: } define <2 x i128> @insert_v2i128_0(<2 x i128> %a, i128 %b, i32 %c) { -; CHECK-LABEL: insert_v2i128_0: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: adds x2, x2, x2 -; CHECK-NEXT: mov x1, x5 -; CHECK-NEXT: mov x0, x4 -; CHECK-NEXT: adc x3, x3, x3 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: insert_v2i128_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: adds x2, x2, x2 +; CHECK-SD-NEXT: mov x1, x5 +; CHECK-SD-NEXT: mov x0, x4 +; CHECK-SD-NEXT: adc x3, x3, x3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v2i128_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adds x2, x2, x2 +; CHECK-GI-NEXT: mov x0, x4 +; CHECK-GI-NEXT: mov x1, x5 +; CHECK-GI-NEXT: adc x3, x3, x3 +; CHECK-GI-NEXT: ret entry: %aa = add <2 x i128> %a, %a %d = insertelement <2 x i128> %aa, i128 %b, i32 0 @@ -1338,13 +1335,21 @@ entry: } define <2 x i128> @insert_v2i128_1(<2 x i128> %a, i128 %b, i32 %c) { -; CHECK-LABEL: insert_v2i128_1: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: adds x0, x0, x0 -; CHECK-NEXT: mov x3, x5 -; CHECK-NEXT: mov x2, x4 -; CHECK-NEXT: adc x1, x1, x1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: insert_v2i128_1: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: adds x0, x0, x0 +; CHECK-SD-NEXT: mov x3, x5 +; CHECK-SD-NEXT: mov x2, x4 +; CHECK-SD-NEXT: adc x1, x1, x1 +; 
CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v2i128_1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adds x0, x0, x0 +; CHECK-GI-NEXT: mov x2, x4 +; CHECK-GI-NEXT: mov x3, x5 +; CHECK-GI-NEXT: adc x1, x1, x1 +; CHECK-GI-NEXT: ret entry: %aa = add <2 x i128> %a, %a %d = insertelement <2 x i128> %aa, i128 %b, i32 1 @@ -1352,28 +1357,63 @@ entry: } define <2 x i128> @insert_v2i128_c(<2 x i128> %a, i128 %b, i32 %c) { -; CHECK-LABEL: insert_v2i128_c: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: // kill: def $w6 killed $w6 def $x6 -; CHECK-NEXT: adds x8, x0, x0 -; CHECK-NEXT: and x11, x6, #0x1 -; CHECK-NEXT: mov x12, sp -; CHECK-NEXT: adc x9, x1, x1 -; CHECK-NEXT: adds x10, x2, x2 -; CHECK-NEXT: add x11, x12, x11, lsl #4 -; CHECK-NEXT: str x8, [sp] -; CHECK-NEXT: adc x8, x3, x3 -; CHECK-NEXT: str x10, [sp, #16] -; CHECK-NEXT: str x4, [x11] -; CHECK-NEXT: str x8, [sp, #24] -; CHECK-NEXT: str x9, [sp, #8] -; CHECK-NEXT: str x5, [x11, #8] -; CHECK-NEXT: ldp x0, x1, [sp] -; CHECK-NEXT: ldp x2, x3, [sp, #16] -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: insert_v2i128_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #32 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: // kill: def $w6 killed $w6 def $x6 +; CHECK-SD-NEXT: adds x8, x0, x0 +; CHECK-SD-NEXT: and x11, x6, #0x1 +; CHECK-SD-NEXT: mov x12, sp +; CHECK-SD-NEXT: adc x9, x1, x1 +; CHECK-SD-NEXT: adds x10, x2, x2 +; CHECK-SD-NEXT: add x11, x12, x11, lsl #4 +; CHECK-SD-NEXT: str x8, [sp] +; CHECK-SD-NEXT: adc x8, x3, x3 +; CHECK-SD-NEXT: str x10, [sp, #16] +; CHECK-SD-NEXT: str x4, [x11] +; CHECK-SD-NEXT: str x8, [sp, #24] +; CHECK-SD-NEXT: str x9, [sp, #8] +; CHECK-SD-NEXT: str x5, [x11, #8] +; CHECK-SD-NEXT: ldp x0, x1, [sp] +; CHECK-SD-NEXT: ldp x2, x3, [sp, #16] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v2i128_c: +; CHECK-GI: // %bb.0: // %entry +; 
CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: adds x8, x0, x0 +; CHECK-GI-NEXT: mov v2.d[0], x4 +; CHECK-GI-NEXT: adc x9, x1, x1 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: adds x8, x2, x2 +; CHECK-GI-NEXT: mov v1.d[0], x8 +; CHECK-GI-NEXT: adc x8, x3, x3 +; CHECK-GI-NEXT: mov v2.d[1], x5 +; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: mov w8, w6 +; CHECK-GI-NEXT: and x8, x8, #0x1 +; CHECK-GI-NEXT: stp q0, q1, [sp] +; CHECK-GI-NEXT: str q2, [x9, x8, lsl #4] +; CHECK-GI-NEXT: ldp q0, q1, [sp] +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: fmov x0, d0 +; CHECK-GI-NEXT: fmov x2, d1 +; CHECK-GI-NEXT: fmov x1, d2 +; CHECK-GI-NEXT: fmov x3, d3 +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret entry: %aa = add <2 x i128> %a, %a %d = insertelement <2 x i128> %aa, i128 %b, i32 %c @@ -1381,20 +1421,38 @@ entry: } define <2 x fp128> @insert_v2fp128_0(<2 x fp128> %a, fp128 %b, i32 %c) { -; CHECK-LABEL: insert_v2fp128_0: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: str q2, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __addtf3 -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: insert_v2fp128_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #32 +; CHECK-SD-NEXT: str 
x30, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: mov v0.16b, v1.16b +; CHECK-SD-NEXT: str q2, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __addtf3 +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v2fp128_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #48 +; CHECK-GI-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: stp q1, q2, [sp] // 32-byte Folded Spill +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: bl __addtf3 +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: bl __addtf3 +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #48 +; CHECK-GI-NEXT: ret entry: %aa = fadd <2 x fp128> %a, %a %d = insertelement <2 x fp128> %aa, fp128 %b, i32 0 @@ -1402,19 +1460,38 @@ entry: } define <2 x fp128> @insert_v2fp128_1(<2 x fp128> %a, fp128 %b, i32 %c) { -; CHECK-LABEL: insert_v2fp128_1: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: str q2, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __addtf3 -; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: insert_v2fp128_1: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #32 +; CHECK-SD-NEXT: str x30, [sp, #16] // 
8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: str q2, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __addtf3 +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v2fp128_1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: stp q1, q2, [sp, #16] // 32-byte Folded Spill +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: bl __addtf3 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: bl __addtf3 +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #64 +; CHECK-GI-NEXT: ret entry: %aa = fadd <2 x fp128> %a, %a %d = insertelement <2 x fp128> %aa, fp128 %b, i32 1 @@ -1422,32 +1499,65 @@ entry: } define <2 x fp128> @insert_v2fp128_c(<2 x fp128> %a, fp128 %b, i32 %c) { -; CHECK-LABEL: insert_v2fp128_c: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #96 -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: mov w19, w0 -; CHECK-NEXT: str q2, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: bl __addtf3 -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: mov v1.16b, v0.16b -; 
CHECK-NEXT: bl __addtf3 -; CHECK-NEXT: str q0, [sp, #64] -; CHECK-NEXT: ldp q3, q0, [sp, #16] // 32-byte Folded Reload -; CHECK-NEXT: and x8, x19, #0x1 -; CHECK-NEXT: add x9, sp, #48 -; CHECK-NEXT: str q3, [sp, #48] -; CHECK-NEXT: str q0, [x9, x8, lsl #4] -; CHECK-NEXT: ldp q0, q1, [sp, #48] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: insert_v2fp128_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #96 +; CHECK-SD-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 96 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: mov w19, w0 +; CHECK-SD-NEXT: str q2, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __addtf3 +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: bl __addtf3 +; CHECK-SD-NEXT: str q0, [sp, #64] +; CHECK-SD-NEXT: ldp q3, q0, [sp, #16] // 32-byte Folded Reload +; CHECK-SD-NEXT: and x8, x19, #0x1 +; CHECK-SD-NEXT: add x9, sp, #48 +; CHECK-SD-NEXT: str q3, [sp, #48] +; CHECK-SD-NEXT: str q0, [x9, x8, lsl #4] +; CHECK-SD-NEXT: ldp q0, q1, [sp, #48] +; CHECK-SD-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #96 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v2fp128_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #96 +; CHECK-GI-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 32 +; CHECK-GI-NEXT: .cfi_offset w19, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -24 +; CHECK-GI-NEXT: .cfi_offset w29, -32 +; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: mov w19, w0 +; CHECK-GI-NEXT: str q2, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __addtf3 +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: bl __addtf3 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov w8, w19 +; CHECK-GI-NEXT: add x9, sp, #64 +; CHECK-GI-NEXT: and x8, x8, #0x1 +; CHECK-GI-NEXT: stp q1, q0, [sp, #64] +; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: str q0, [x9, x8, lsl #4] +; CHECK-GI-NEXT: ldp q0, q1, [sp, #64] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret entry: %aa = fadd <2 x fp128> %a, %a %d = insertelement <2 x fp128> %aa, fp128 %b, i32 %c @@ -2741,31 +2851,60 @@ entry: } define i128 @extract_v2i128_c(<2 x i128> %a, i32 %c) { -; CHECK-LABEL: extract_v2i128_c: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: adds x9, x0, x0 -; CHECK-NEXT: mov w8, w4 -; CHECK-NEXT: adc x10, x1, x1 -; CHECK-NEXT: adds x11, x2, x2 -; CHECK-NEXT: fmov d1, x9 -; CHECK-NEXT: fmov d0, x11 -; CHECK-NEXT: adc x12, x3, x3 -; CHECK-NEXT: add x8, x8, x8 -; CHECK-NEXT: and x9, x8, #0x3 -; CHECK-NEXT: add w8, w8, #1 -; CHECK-NEXT: mov x11, sp -; CHECK-NEXT: mov v1.d[1], x10 -; CHECK-NEXT: add x10, sp, #32 -; CHECK-NEXT: 
and x8, x8, #0x3 -; CHECK-NEXT: mov v0.d[1], x12 -; CHECK-NEXT: stp q1, q0, [sp] -; CHECK-NEXT: stp q1, q0, [sp, #32] -; CHECK-NEXT: ldr x0, [x10, x9, lsl #3] -; CHECK-NEXT: ldr x1, [x11, x8, lsl #3] -; CHECK-NEXT: add sp, sp, #64 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extract_v2i128_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #64 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: adds x9, x0, x0 +; CHECK-SD-NEXT: mov w8, w4 +; CHECK-SD-NEXT: adc x10, x1, x1 +; CHECK-SD-NEXT: adds x11, x2, x2 +; CHECK-SD-NEXT: fmov d1, x9 +; CHECK-SD-NEXT: fmov d0, x11 +; CHECK-SD-NEXT: adc x12, x3, x3 +; CHECK-SD-NEXT: add x8, x8, x8 +; CHECK-SD-NEXT: and x9, x8, #0x3 +; CHECK-SD-NEXT: add w8, w8, #1 +; CHECK-SD-NEXT: mov x11, sp +; CHECK-SD-NEXT: mov v1.d[1], x10 +; CHECK-SD-NEXT: add x10, sp, #32 +; CHECK-SD-NEXT: and x8, x8, #0x3 +; CHECK-SD-NEXT: mov v0.d[1], x12 +; CHECK-SD-NEXT: stp q1, q0, [sp] +; CHECK-SD-NEXT: stp q1, q0, [sp, #32] +; CHECK-SD-NEXT: ldr x0, [x10, x9, lsl #3] +; CHECK-SD-NEXT: ldr x1, [x11, x8, lsl #3] +; CHECK-SD-NEXT: add sp, sp, #64 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2i128_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: adds x8, x0, x0 +; CHECK-GI-NEXT: adc x9, x1, x1 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: adds x8, x2, x2 +; CHECK-GI-NEXT: mov v1.d[0], x8 +; CHECK-GI-NEXT: adc x8, x3, x3 +; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: mov w8, w4 +; CHECK-GI-NEXT: and x8, x8, #0x1 +; CHECK-GI-NEXT: stp q0, q1, [sp] +; CHECK-GI-NEXT: ldr q0, [x9, x8, lsl #4] +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: fmov x0, d0 +; CHECK-GI-NEXT: fmov x1, d1 +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret entry: %b = add <2 x i128> %a, %a %d = extractelement <2 x i128> %b, i32 %c @@ -2792,16 +2931,34 @@ entry: } define fp128 @extract_v2fp128_c(<2 x fp128> %a, i32 %c) { -; CHECK-LABEL: extract_v2fp128_c: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp q0, q1, [sp, #-32]! -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: and x8, x0, #0x1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: ldr q0, [x9, x8, lsl #4] -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extract_v2fp128_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]! +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: and x8, x0, #0x1 +; CHECK-SD-NEXT: mov x9, sp +; CHECK-SD-NEXT: ldr q0, [x9, x8, lsl #4] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2fp128_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: stp q0, q1, [sp] +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: and x8, x8, #0x1 +; CHECK-GI-NEXT: ldr q0, [x9, x8, lsl #4] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret entry: %d = extractelement <2 x fp128> %a, i32 %c ret fp128 %d diff --git a/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll b/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll new file mode 100644 index 000000000000..8b8a3e430df6 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/local-bounds-single-trap.ll @@ -0,0 +1,86 @@ +; RUN: llc -O3 -mtriple arm64-linux -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-ASM +; This test checks that nomerge correctly prevents the traps from being merged +; in the compiled code. +; Prior to ae6dc64ec670891cb15049277e43133d4df7fb4b, this test showed that +; nomerge did not work correctly. 
+ +@B = dso_local global [10 x i8] zeroinitializer, align 1 +@B2 = dso_local global [10 x i8] zeroinitializer, align 1 + +; Function Attrs: noinline nounwind uwtable +define dso_local void @f8(i32 noundef %i, i32 noundef %k) #0 { +entry: +; CHECK-ASM: cmp x8, #10 +; CHECK-ASM: b.hi .LBB0_5 +; CHECK-ASM: // %bb.1: // %entry +; CHECK-ASM: mov w9, #10 // =0xa +; CHECK-ASM: sub x9, x9, x8 +; CHECK-ASM: cbz x9, .LBB0_5 +; CHECK-ASM: // %bb.2: +; CHECK-ASM: ldrsw x9, [sp, #8] +; CHECK-ASM: adrp x10, B +; CHECK-ASM: add x10, x10, :lo12:B +; CHECK-ASM: strb wzr, [x10, x8] +; CHECK-ASM: cmp x9, #10 +; CHECK-ASM: b.hi .LBB0_6 +; CHECK-ASM: // %bb.3: +; CHECK-ASM: mov w8, #10 // =0xa +; CHECK-ASM: sub x8, x8, x9 +; CHECK-ASM: cbz x8, .LBB0_6 +; CHECK-ASM: // %bb.4: +; CHECK-ASM: adrp x8, B2 +; CHECK-ASM: add x8, x8, :lo12:B2 +; CHECK-ASM: strb wzr, [x8, x9] +; CHECK-ASM: add sp, sp, #16 +; CHECK-ASM: .cfi_def_cfa_offset 0 +; CHECK-ASM: ret +; CHECK-ASM: .LBB0_5: // %trap +; CHECK-ASM: .cfi_restore_state +; CHECK-ASM: brk #0x1 +; CHECK-ASM: .LBB0_6: // %trap3 +; CHECK-ASM: brk #0x1 + %i.addr = alloca i32, align 4 + %k.addr = alloca i32, align 4 + store i32 %i, ptr %i.addr, align 4 + store i32 %k, ptr %k.addr, align 4 + %0 = load i32, ptr %i.addr, align 4 + %idxprom = sext i32 %0 to i64 + %1 = add i64 0, %idxprom + %arrayidx = getelementptr inbounds [10 x i8], ptr @B, i64 0, i64 %idxprom + %2 = sub i64 10, %1 + %3 = icmp ult i64 10, %1 + %4 = icmp ult i64 %2, 1 + %5 = or i1 %3, %4 + br i1 %5, label %trap, label %6 + +6: ; preds = %entry + store i8 0, ptr %arrayidx, align 1 + %7 = load i32, ptr %k.addr, align 4 + %idxprom1 = sext i32 %7 to i64 + %8 = add i64 0, %idxprom1 + %arrayidx2 = getelementptr inbounds [10 x i8], ptr @B2, i64 0, i64 %idxprom1 + %9 = sub i64 10, %8 + %10 = icmp ult i64 10, %8 + %11 = icmp ult i64 %9, 1 + %12 = or i1 %10, %11 + br i1 %12, label %trap3, label %13 + +13: ; preds = %6 + store i8 0, ptr %arrayidx2, align 1 + ret void + +trap: ; preds = %entry + 
call void @llvm.trap() #2 + unreachable + +trap3: ; preds = %6 + call void @llvm.trap() #2 + unreachable +} + +; Function Attrs: cold noreturn nounwind memory(inaccessiblemem: write) +declare void @llvm.trap() #1 + +attributes #0 = { noinline nounwind uwtable } +attributes #1 = { cold noreturn nounwind memory(inaccessiblemem: write) } +attributes #2 = { noreturn nounwind nomerge } diff --git a/llvm/test/CodeGen/AArch64/lshr-trunc-lshr.ll b/llvm/test/CodeGen/AArch64/lshr-trunc-lshr.ll new file mode 100644 index 000000000000..8a576fc346bd --- /dev/null +++ b/llvm/test/CodeGen/AArch64/lshr-trunc-lshr.ll @@ -0,0 +1,125 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-none-eabi -global-isel=0 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-eabi -global-isel=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +define i32 @s32_test1(i64 %a) { +; CHECK-LABEL: s32_test1: +; CHECK: // %bb.0: +; CHECK-NEXT: lsr x0, x0, #48 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %r = lshr i64 %a, 32 + %ret = trunc i64 %r to i32 + %x = lshr i32 %ret, 16 + ret i32 %x +} + +define i32 @s32_test2(i64 %a) { +; CHECK-LABEL: s32_test2: +; CHECK: // %bb.0: +; CHECK-NEXT: ubfx x0, x0, #32, #16 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %r = lshr i64 %a, 16 + %ret = trunc i64 %r to i32 + %x = lshr i32 %ret, 16 + ret i32 %x +} + +define <8 x i8> @v8s8_test1(<8 x i16> %a) { +; CHECK-LABEL: v8s8_test1: +; CHECK: // %bb.0: +; CHECK-NEXT: ushr v0.8h, v0.8h, #12 +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: ret + %r = lshr <8 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %ret = trunc <8 x i16> %r to <8 x i8> + %x = lshr <8 x i8> %ret, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> + ret <8 x i8> %x +} + +define <8 x i8> @v8s8_test2(<8 x i16> 
%a) { +; CHECK-SD-LABEL: v8s8_test2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ushr v0.8h, v0.8h, #8 +; CHECK-SD-NEXT: bic v0.8h, #240 +; CHECK-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v8s8_test2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.8h, #15 +; CHECK-GI-NEXT: ushr v0.8h, v0.8h, #8 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: ret + %r = lshr <8 x i16> %a, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4> + %ret = trunc <8 x i16> %r to <8 x i8> + %x = lshr <8 x i8> %ret, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> + ret <8 x i8> %x +} + +define <4 x i16> @v4s16_test1(<4 x i32> %a) { +; CHECK-LABEL: v4s16_test1: +; CHECK: // %bb.0: +; CHECK-NEXT: ushr v0.4s, v0.4s, #24 +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret + %r = lshr <4 x i32> %a, <i32 16, i32 16, i32 16, i32 16> + %ret = trunc <4 x i32> %r to <4 x i16> + %x = lshr <4 x i16> %ret, <i16 8, i16 8, i16 8, i16 8> + ret <4 x i16> %x +} + +define <4 x i16> @v4s16_test2(<4 x i32> %a) { +; CHECK-SD-LABEL: v4s16_test2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v4s16_test2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.2d, #0x0000ff000000ff +; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #16 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret + %r = lshr <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8> + %ret = trunc <4 x i32> %r to <4 x i16> + %x = lshr <4 x i16> %ret, <i16 8, i16 8, i16 8, i16 8> + ret <4 x i16> %x +} + +define <2 x i32> @v2s32_test1(<2 x i64> %a) { +; CHECK-LABEL: v2s32_test1: +; CHECK: // %bb.0: +; CHECK-NEXT: ushr v0.2d, v0.2d, #48 +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ret + %r = lshr <2 x i64> %a, <i64 32, i64 32> + %ret = trunc <2 x i64> %r to <2 x i32> + %x = lshr <2 x i32> %ret, <i32 16, i32 16> + ret <2 x i32> %x +} + +define <2 x 
i32> @v2s32_test2(<2 x i64> %a) { +; CHECK-SD-LABEL: v2s32_test2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-SD-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v2s32_test2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.2d, #0x0000000000ffff +; CHECK-GI-NEXT: ushr v0.2d, v0.2d, #32 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: xtn v0.2s, v0.2d +; CHECK-GI-NEXT: ret + %r = lshr <2 x i64> %a, <i64 16, i64 16> + %ret = trunc <2 x i64> %r to <2 x i32> + %x = lshr <2 x i32> %ret, <i32 16, i32 16> + ret <2 x i32> %x +} diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-flags.ll b/llvm/test/CodeGen/AArch64/machine-outliner-flags.ll index c435093b794e..0fbf2bc43d1e 100644 --- a/llvm/test/CodeGen/AArch64/machine-outliner-flags.ll +++ b/llvm/test/CodeGen/AArch64/machine-outliner-flags.ll @@ -1,25 +1,15 @@ ; REQUIRES: asserts -; RUN: llc %s -debug-pass=Structure -verify-machineinstrs \ -; RUN: --debug-only=machine-outliner -enable-machine-outliner=always \ -; RUN: -mtriple arm64---- -o /dev/null 2>&1 \ -; RUN: | FileCheck %s -check-prefix=ALWAYS +; RUN: llc %s -debug-pass=Structure -verify-machineinstrs --debug-only=machine-outliner -enable-machine-outliner=always -mtriple arm64---- -o /dev/null 2>&1 | FileCheck %s -check-prefixes=CHECK,ALWAYS +; RUN: llc %s -debug-pass=Structure -verify-machineinstrs --debug-only=machine-outliner -enable-machine-outliner -mtriple arm64---- -o /dev/null 2>&1 | FileCheck %s -check-prefixes=CHECK,ALWAYS -; RUN: llc %s -debug-pass=Structure -verify-machineinstrs \ -; RUN: --debug-only=machine-outliner -enable-machine-outliner \ -; RUN: -mtriple arm64---- -o /dev/null 2>&1 \ -; RUN: | FileCheck %s -check-prefix=ENABLE +; RUN: llc %s -debug-pass=Structure -verify-machineinstrs --debug-only=machine-outliner -mtriple arm64---- -o /dev/null 2>&1 | FileCheck %s -check-prefixes=CHECK,TARGET-DEFAULT -; RUN: llc %s 
-debug-pass=Structure -verify-machineinstrs \ -; RUN: -enable-machine-outliner=never -mtriple arm64---- -o /dev/null 2>&1 \ -; RUN: | FileCheck %s -check-prefix=NEVER +; RUN: llc %s -debug-pass=Structure -verify-machineinstrs --debug-only=machine-outliner -enable-machine-outliner=optimistic-pgo -mtriple arm64---- -o /dev/null 2>&1 | FileCheck %s -check-prefixes=CHECK,OPTIMISTIC -; RUN: llc %s -debug-pass=Structure -verify-machineinstrs \ -; RUN: --debug-only=machine-outliner -mtriple arm64---- -o /dev/null 2>&1 \ -; RUN: | FileCheck %s -check-prefix=NOT-ADDED +; RUN: llc %s -debug-pass=Structure -verify-machineinstrs --debug-only=machine-outliner -enable-machine-outliner=conservative-pgo -mtriple arm64---- -o /dev/null 2>&1 | FileCheck %s -check-prefixes=CHECK,CONSERVATIVE -; RUN: llc %s -O=0 -debug-pass=Structure -verify-machineinstrs \ -; RUN: -mtriple arm64---- -o /dev/null 2>&1 \ -; RUN: | FileCheck %s -check-prefix=OPTNONE +; RUN: llc %s -debug-pass=Structure -verify-machineinstrs --debug-only=machine-outliner -enable-machine-outliner=never -mtriple arm64---- -o /dev/null 2>&1 | FileCheck %s -check-prefix=DISABLED +; RUN: llc %s -debug-pass=Structure -verify-machineinstrs --debug-only=machine-outliner -O=0 -mtriple arm64---- -o /dev/null 2>&1 | FileCheck %s -check-prefix=DISABLED ; Make sure that the outliner is added to the pass pipeline only when the ; appropriate flags/settings are set. Make sure it isn't added otherwise. 
@@ -27,23 +17,21 @@ ; Cases where it should be added: ; * -enable-machine-outliner ; * -enable-machine-outliner=always -; * -enable-machine-outliner is not passed (AArch64 supports -; target-default outlining) +; * -enable-machine-outliner=optimistic-pgo +; * -enable-machine-outliner=conservative-pgo +; * -enable-machine-outliner is not passed (AArch64 supports target-default outlining) ; ; Cases where it should not be added: ; * -O0 or equivalent ; * -enable-machine-outliner=never is passed -; ALWAYS: Machine Outliner +; CHECK: Machine Outliner +; DISABLED-NOT: Machine Outliner ; ALWAYS: Machine Outliner: Running on all functions -; ENABLE: Machine Outliner -; ENABLE: Machine Outliner: Running on all functions -; NEVER-NOT: Machine Outliner -; NOT-ADDED: Machine Outliner -; NOT-ADDED: Machine Outliner: Running on target-default functions -; OPTNONE-NOT: Machine Outliner +; OPTIMISTIC: Machine Outliner: Running on optimistically cold functions +; CONSERVATIVE: Machine Outliner: Running on conservatively cold functions +; TARGET-DEFAULT: Machine Outliner: Running on target-default functions define void @foo() { ret void; } - diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-pgo.ll b/llvm/test/CodeGen/AArch64/machine-outliner-pgo.ll new file mode 100644 index 000000000000..d0ea5e9ae101 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-outliner-pgo.ll @@ -0,0 +1,83 @@ +; RUN: rm -rf %t && split-file %s %t + +; RUN: llvm-profdata merge %t/a.proftext -o %t/a.profdata +; RUN: opt %t/a.ll -passes=pgo-instr-use -pgo-test-profile-file=%t/a.profdata -S -o %t/a2.ll + +; RUN: llc < %t/a2.ll -enable-machine-outliner=conservative-pgo -mtriple=aarch64-linux-gnu -profile-summary-cold-count=0 | FileCheck %s --check-prefixes=CHECK,CONSERVATIVE +; RUN: llc < %t/a2.ll -enable-machine-outliner=optimistic-pgo -mtriple=aarch64-linux-gnu -profile-summary-cold-count=0 | FileCheck %s --check-prefixes=CHECK,OPTIMISTIC + +;--- a.ll +declare void @z(i32, i32, i32, i32) + +; 
CHECK-LABEL: always_outline: +define void @always_outline() cold { +entry: +; CHECK: [[OUTLINED:OUTLINED_FUNCTION_[0-9]+]] + tail call void @z(i32 1, i32 2, i32 3, i32 4) + ret void +; CHECK: .cfi_endproc +} + +; CHECK-LABEL: cold: +define void @cold() { +entry: +; CHECK: [[OUTLINED]] + tail call void @z(i32 1, i32 2, i32 3, i32 4) + ret void +; CHECK: .cfi_endproc +} + +; CHECK-LABEL: hot: +define void @hot() minsize { +entry: +; CHECK-NOT: [[OUTLINED]] + tail call void @z(i32 1, i32 2, i32 3, i32 4) + ret void +; CHECK: .cfi_endproc +} + +; CHECK-LABEL: no_profile_minsize: +define void @no_profile_minsize() minsize { +entry: +; CONSERVATIVE-NOT: [[OUTLINED]] +; OPTIMISTIC: [[OUTLINED]] + tail call void @z(i32 1, i32 2, i32 3, i32 4) + ret void +; CHECK: .cfi_endproc +} + +; CHECK-LABEL: no_profile_optsize: +define void @no_profile_optsize() optsize { +entry: +; CHECK-NOT: [[OUTLINED]] + tail call void @z(i32 1, i32 2, i32 3, i32 4) + ret void +; CHECK: .cfi_endproc +} + +; CHECK: [[OUTLINED]]: +; CHECK-SAME: // @{{.*}} Tail Call +; CHECK: mov w0, #1 +; CHECK-NEXT: mov w1, #2 +; CHECK-NEXT: mov w2, #3 +; CHECK-NEXT: mov w3, #4 +; CHECK-NEXT: b z + +;--- a.proftext +:ir + +cold +# Func Hash: +742261418966908927 +# Num Counters: +1 +# Counter Values: +0 + +hot +# Func Hash: +742261418966908927 +# Num Counters: +1 +# Counter Values: +100 diff --git a/llvm/test/CodeGen/AArch64/misched-fuse-cset.ll b/llvm/test/CodeGen/AArch64/misched-fuse-cset.ll new file mode 100644 index 000000000000..fa729d04a79e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/misched-fuse-cset.ll @@ -0,0 +1,43 @@ +; RUN: llc %s -o - -mtriple=aarch64-unknown -mattr=+fuse-cset | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a78 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a710 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a715 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a720 | FileCheck %s +; 
RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a725 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-x4 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-x925 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n2 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n3 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v1 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v2 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v3 | FileCheck %s + +target triple = "aarch64-unknown" + +define i32 @test_sub_cselw(i32 %a0, i32 %a1, i32 %a2) { +entry: + %v0 = sub i32 %a0, 13 + %cond = icmp eq i32 %v0, 0 + %v1 = add i32 %a1, 7 + %v2 = select i1 %cond, i32 0, i32 1 + %v3 = xor i32 %v1, %v2 + ret i32 %v3 + +; CHECK-LABEL: test_sub_cselw: +; CHECK: cmp {{w[0-9]}}, #13 +; CHECK-NEXT: cset {{w[0-9]}} +} + +define i64 @test_sub_cselx(i64 %a0, i64 %a1, i64 %a2) { +entry: + %v0 = sub i64 %a0, 13 + %cond = icmp eq i64 %v0, 0 + %v1 = add i64 %a1, 7 + %v2 = select i1 %cond, i64 0, i64 1 + %v3 = xor i64 %v1, %v2 + ret i64 %v3 + +; CHECK-LABEL: test_sub_cselx: +; CHECK: cmp {{x[0-9]}}, #13 +; CHECK-NEXT: cset {{w[0-9]}} +} diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-csel.ll b/llvm/test/CodeGen/AArch64/misched-fusion-csel.ll index ac0adb7f85d0..8fa60ee93663 100644 --- a/llvm/test/CodeGen/AArch64/misched-fusion-csel.ll +++ b/llvm/test/CodeGen/AArch64/misched-fusion-csel.ll @@ -1,9 +1,42 @@ -; RUN: llc %s -o - -mtriple=aarch64-unknown -mattr=fuse-csel | FileCheck %s -; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m3 | FileCheck %s -; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m4 | FileCheck %s -; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m5 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mattr=fuse-csel -debug-only=machine-scheduler 2>&1 | FileCheck %s +; RUN: 
llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m3 -debug-only=machine-scheduler 2>&1 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m4 -debug-only=machine-scheduler 2>&1 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m5 -debug-only=machine-scheduler 2>&1 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a78 -debug-only=machine-scheduler 2>&1 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a710 -debug-only=machine-scheduler 2>&1 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a715 -debug-only=machine-scheduler 2>&1 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a720 -debug-only=machine-scheduler 2>&1 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a725 -debug-only=machine-scheduler 2>&1 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-x4 -debug-only=machine-scheduler 2>&1 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-x925 -debug-only=machine-scheduler 2>&1 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n2 -debug-only=machine-scheduler 2>&1 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-n3 -debug-only=machine-scheduler 2>&1 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v1 -debug-only=machine-scheduler 2>&1 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v2 -debug-only=machine-scheduler 2>&1 | FileCheck %s +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=neoverse-v3 -debug-only=machine-scheduler 2>&1 | FileCheck %s +; REQUIRES: asserts -target triple = "aarch64-unknown" +; Check that the scheduling model has an edge between the SUBS and the CSEL. 
+; CHECK-LABEL: test_sub_cselw:%bb.0 +; CHECK: SU(2): %3:gpr32common = ADDWri %1:gpr32common, 7, 0 +; CHECK: SU(3): dead $wzr = SUBSWri %0:gpr32common, 13, 0, implicit-def $nzcv +; CHECK: Successors: +; CHECK: SU(4): Ord Latency=0 Cluster +; CHECK: SU(4): %5:gpr32 = CSELWr %0:gpr32common, %3:gpr32common, 0, implicit killed $nzcv +; CHECK: Predecessors: +; CHECK: SU(3): Ord Latency=0 Cluster +; CHECK: SU(5): $w0 = COPY %5:gpr32 + + +; CHECK-LABEL: test_sub_cselx:%bb.0 +; CHECK: SU(2): %3:gpr64common = ADDXri %1:gpr64common, 7, 0 +; CHECK: SU(3): dead $xzr = SUBSXri %0:gpr64common, 13, 0, implicit-def $nzcv +; CHECK: Successors: +; CHECK: SU(4): Ord Latency=0 Cluster +; CHECK: SU(4): %5:gpr64 = CSELXr %0:gpr64common, %3:gpr64common, 0, implicit killed $nzcv +; CHECK: Predecessors: +; CHECK: SU(3): Ord Latency=0 Cluster +; CHECK: SU(5): $x0 = COPY %5:gpr64 define i32 @test_sub_cselw(i32 %a0, i32 %a1, i32 %a2) { entry: diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index 88b6f6c40bac..fb2a1fa697c2 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -2400,7 +2400,7 @@ define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; CHECK-GI-NEXT: .cfi_offset w30, -88 ; CHECK-GI-NEXT: .cfi_offset w29, -96 ; CHECK-GI-NEXT: ldp q2, q1, [x1] -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: str w2, [sp, #12] // 4-byte Folded Spill ; CHECK-GI-NEXT: mov b6, v2.b[3] ; CHECK-GI-NEXT: mov b7, v2.b[4] @@ -2710,7 +2710,7 @@ define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) { ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GI-NEXT: .cfi_offset w19, -16 ; CHECK-GI-NEXT: ldp q2, q1, [x0] -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: umov w15, v2.b[0] ; CHECK-GI-NEXT: umov w17, v2.b[4] ; CHECK-GI-NEXT: umov w0, v2.b[8] @@ -2830,7 +2830,7 @@ define i32 
@test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; CHECK-GI-NEXT: .cfi_offset w30, -88 ; CHECK-GI-NEXT: .cfi_offset w29, -96 ; CHECK-GI-NEXT: ldp q2, q1, [x1] -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: str w2, [sp, #12] // 4-byte Folded Spill ; CHECK-GI-NEXT: mov b5, v2.b[2] ; CHECK-GI-NEXT: mov b6, v2.b[3] @@ -3360,12 +3360,12 @@ define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 ; CHECK-GI-NEXT: sbfx w9, w11, #8, #8 ; CHECK-GI-NEXT: lsl w11, w3, #8 ; CHECK-GI-NEXT: sbfx w14, w14, #8, #8 -; CHECK-GI-NEXT: fmov s1, wzr +; CHECK-GI-NEXT: movi d1, #0000000000000000 ; CHECK-GI-NEXT: lsl w10, w10, #8 ; CHECK-GI-NEXT: mov v4.h[1], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #152] ; CHECK-GI-NEXT: sbfx w11, w11, #8, #8 -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: mov v2.h[2], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #40] ; CHECK-GI-NEXT: sbfx w10, w10, #8, #8 @@ -4012,25 +4012,24 @@ define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> % ; CHECK-GI-NEXT: sxtb w8, w4 ; CHECK-GI-NEXT: sxtb w10, w10 ; CHECK-GI-NEXT: ldr w14, [sp, #448] -; CHECK-GI-NEXT: fmov s1, wzr -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d1, #0000000000000000 +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: fmov s3, w8 ; CHECK-GI-NEXT: sxtb w8, w2 ; CHECK-GI-NEXT: fmov s5, w10 ; CHECK-GI-NEXT: mov v2.s[1], w9 ; CHECK-GI-NEXT: sxtb w9, w5 ; CHECK-GI-NEXT: ldr w10, [sp, #80] -; CHECK-GI-NEXT: mov v1.s[1], wzr -; CHECK-GI-NEXT: mov v0.s[1], wzr ; CHECK-GI-NEXT: mov v3.s[1], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #16] ; CHECK-GI-NEXT: sxtb w10, w10 +; CHECK-GI-NEXT: mov v1.s[1], wzr +; CHECK-GI-NEXT: mov v0.s[1], wzr ; CHECK-GI-NEXT: mov v2.s[2], w8 ; CHECK-GI-NEXT: sxtb w9, w9 ; CHECK-GI-NEXT: ldr w8, [sp, #24] ; CHECK-GI-NEXT: fmov s6, w10 ; CHECK-GI-NEXT: ldr w10, [sp, #64] -; CHECK-GI-NEXT: mov v1.s[2], wzr ; CHECK-GI-NEXT: mov 
v3.s[2], w11 ; CHECK-GI-NEXT: fmov s4, w9 ; CHECK-GI-NEXT: sxtb w8, w8 @@ -4039,7 +4038,7 @@ define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> % ; CHECK-GI-NEXT: sxtb w10, w10 ; CHECK-GI-NEXT: mov v2.s[3], w12 ; CHECK-GI-NEXT: ldr w12, [sp, #88] -; CHECK-GI-NEXT: mov v0.s[2], wzr +; CHECK-GI-NEXT: mov v1.s[2], wzr ; CHECK-GI-NEXT: mov v4.s[1], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #120] ; CHECK-GI-NEXT: sxtb w9, w9 @@ -4063,7 +4062,7 @@ define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> % ; CHECK-GI-NEXT: ldr w10, [sp, #136] ; CHECK-GI-NEXT: sxtb w13, w13 ; CHECK-GI-NEXT: sxtb w9, w9 -; CHECK-GI-NEXT: mov v1.s[3], wzr +; CHECK-GI-NEXT: mov v0.s[2], wzr ; CHECK-GI-NEXT: mov v7.s[1], w8 ; CHECK-GI-NEXT: sxtb w10, w10 ; CHECK-GI-NEXT: ldr w8, [sp, #72] @@ -4072,8 +4071,9 @@ define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> % ; CHECK-GI-NEXT: mov v4.s[3], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #360] ; CHECK-GI-NEXT: sxtb w8, w8 -; CHECK-GI-NEXT: mov v0.s[3], wzr +; CHECK-GI-NEXT: mov v1.s[3], wzr ; CHECK-GI-NEXT: sxtb w13, w13 +; CHECK-GI-NEXT: mov v0.s[3], wzr ; CHECK-GI-NEXT: add v2.4s, v2.4s, v3.4s ; CHECK-GI-NEXT: mov v7.s[2], w12 ; CHECK-GI-NEXT: ldr w12, [sp, #352] @@ -4562,13 +4562,13 @@ define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; CHECK-GI-NEXT: .cfi_offset w30, -88 ; CHECK-GI-NEXT: .cfi_offset w29, -96 ; CHECK-GI-NEXT: ldp q7, q16, [x1] -; CHECK-GI-NEXT: fmov s5, wzr +; CHECK-GI-NEXT: movi d5, #0000000000000000 ; CHECK-GI-NEXT: str w2, [sp, #12] // 4-byte Folded Spill -; CHECK-GI-NEXT: fmov s6, wzr -; CHECK-GI-NEXT: fmov s0, wzr -; CHECK-GI-NEXT: fmov s1, wzr -; CHECK-GI-NEXT: fmov s3, wzr -; CHECK-GI-NEXT: fmov s2, wzr +; CHECK-GI-NEXT: movi d6, #0000000000000000 +; CHECK-GI-NEXT: movi d0, #0000000000000000 +; CHECK-GI-NEXT: movi d1, #0000000000000000 +; CHECK-GI-NEXT: movi d3, #0000000000000000 +; CHECK-GI-NEXT: movi d2, #0000000000000000 ; 
CHECK-GI-NEXT: mov b23, v7.b[7] ; CHECK-GI-NEXT: mov b17, v7.b[1] ; CHECK-GI-NEXT: fmov w11, s7 @@ -4822,7 +4822,7 @@ define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; CHECK-GI-NEXT: mov v19.h[6], w11 ; CHECK-GI-NEXT: fmov w12, s16 ; CHECK-GI-NEXT: fmov w11, s7 -; CHECK-GI-NEXT: fmov s4, wzr +; CHECK-GI-NEXT: movi d4, #0000000000000000 ; CHECK-GI-NEXT: uxtb w9, w9 ; CHECK-GI-NEXT: mov v20.h[6], w10 ; CHECK-GI-NEXT: umov w10, v24.h[0] @@ -4991,13 +4991,13 @@ define i32 @test_udot_v33i8_nomla(ptr nocapture readonly %a1) { ; CHECK-GI-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NEXT: ldp q7, q19, [x0] -; CHECK-GI-NEXT: fmov s1, wzr +; CHECK-GI-NEXT: movi d1, #0000000000000000 ; CHECK-GI-NEXT: ldrb w10, [x0, #32] -; CHECK-GI-NEXT: fmov s0, wzr -; CHECK-GI-NEXT: fmov s3, wzr -; CHECK-GI-NEXT: fmov s2, wzr -; CHECK-GI-NEXT: fmov s5, wzr -; CHECK-GI-NEXT: fmov s4, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 +; CHECK-GI-NEXT: movi d3, #0000000000000000 +; CHECK-GI-NEXT: movi d2, #0000000000000000 +; CHECK-GI-NEXT: movi d5, #0000000000000000 +; CHECK-GI-NEXT: movi d4, #0000000000000000 ; CHECK-GI-NEXT: umov w15, v7.b[8] ; CHECK-GI-NEXT: umov w2, v7.b[12] ; CHECK-GI-NEXT: umov w16, v7.b[9] @@ -5022,13 +5022,13 @@ define i32 @test_udot_v33i8_nomla(ptr nocapture readonly %a1) { ; CHECK-GI-NEXT: mov v17.s[1], w16 ; CHECK-GI-NEXT: mov v18.s[1], w4 ; CHECK-GI-NEXT: umov w4, v19.b[4] +; CHECK-GI-NEXT: movi d6, #0000000000000000 ; CHECK-GI-NEXT: umov w6, v19.b[1] ; CHECK-GI-NEXT: umov w7, v19.b[5] -; CHECK-GI-NEXT: umov w19, v19.b[9] ; CHECK-GI-NEXT: mov v7.s[1], w9 ; CHECK-GI-NEXT: mov v16.s[1], w14 +; CHECK-GI-NEXT: umov w19, v19.b[9] ; CHECK-GI-NEXT: umov w20, v19.b[13] -; CHECK-GI-NEXT: fmov s6, wzr ; CHECK-GI-NEXT: umov w12, v19.b[2] ; CHECK-GI-NEXT: umov w8, v19.b[3] ; CHECK-GI-NEXT: mov v17.s[2], w3 @@ -5164,13 +5164,13 @@ define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; 
CHECK-GI-NEXT: .cfi_offset w30, -88 ; CHECK-GI-NEXT: .cfi_offset w29, -96 ; CHECK-GI-NEXT: ldp q7, q16, [x1] -; CHECK-GI-NEXT: fmov s1, wzr +; CHECK-GI-NEXT: movi d1, #0000000000000000 ; CHECK-GI-NEXT: str w2, [sp, #12] // 4-byte Folded Spill -; CHECK-GI-NEXT: fmov s3, wzr -; CHECK-GI-NEXT: fmov s2, wzr -; CHECK-GI-NEXT: fmov s5, wzr -; CHECK-GI-NEXT: fmov s4, wzr -; CHECK-GI-NEXT: fmov s6, wzr +; CHECK-GI-NEXT: movi d3, #0000000000000000 +; CHECK-GI-NEXT: movi d2, #0000000000000000 +; CHECK-GI-NEXT: movi d5, #0000000000000000 +; CHECK-GI-NEXT: movi d4, #0000000000000000 +; CHECK-GI-NEXT: movi d6, #0000000000000000 ; CHECK-GI-NEXT: mov b19, v7.b[3] ; CHECK-GI-NEXT: mov b23, v7.b[7] ; CHECK-GI-NEXT: mov b17, v7.b[1] @@ -5454,7 +5454,7 @@ define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; CHECK-GI-NEXT: smov w8, v20.h[7] ; CHECK-GI-NEXT: sxth w9, w9 ; CHECK-GI-NEXT: mov v16.s[1], w12 -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: fmov s19, w15 ; CHECK-GI-NEXT: smov w15, v22.h[6] ; CHECK-GI-NEXT: mov v1.s[1], wzr @@ -5900,28 +5900,28 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 ; CHECK-GI-NEXT: sbfx w15, w15, #8, #8 ; CHECK-GI-NEXT: mov v23.h[2], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #112] -; CHECK-GI-NEXT: fmov s19, wzr +; CHECK-GI-NEXT: movi d19, #0000000000000000 ; CHECK-GI-NEXT: sbfx w9, w9, #8, #8 ; CHECK-GI-NEXT: sbfx w11, w11, #8, #8 -; CHECK-GI-NEXT: fmov s21, wzr +; CHECK-GI-NEXT: movi d21, #0000000000000000 ; CHECK-GI-NEXT: mov v22.h[3], w10 ; CHECK-GI-NEXT: ldr w10, [sp, #144] ; CHECK-GI-NEXT: lsl w8, w8, #8 -; CHECK-GI-NEXT: fmov s16, wzr -; CHECK-GI-NEXT: fmov s18, wzr -; CHECK-GI-NEXT: fmov s17, wzr +; CHECK-GI-NEXT: movi d16, #0000000000000000 +; CHECK-GI-NEXT: movi d18, #0000000000000000 +; CHECK-GI-NEXT: movi d17, #0000000000000000 ; CHECK-GI-NEXT: lsl w10, w10, #8 ; CHECK-GI-NEXT: mov v23.h[3], w9 ; CHECK-GI-NEXT: sbfx w8, w8, #8, #8 ; 
CHECK-GI-NEXT: ldr w9, [sp, #120] -; CHECK-GI-NEXT: fmov s20, wzr -; CHECK-GI-NEXT: fmov s6, wzr +; CHECK-GI-NEXT: movi d20, #0000000000000000 +; CHECK-GI-NEXT: movi d6, #0000000000000000 ; CHECK-GI-NEXT: sbfx w10, w10, #8, #8 ; CHECK-GI-NEXT: mov v22.h[4], w11 ; CHECK-GI-NEXT: lsl w11, w5, #8 ; CHECK-GI-NEXT: lsl w9, w9, #8 -; CHECK-GI-NEXT: fmov s7, wzr -; CHECK-GI-NEXT: fmov s2, wzr +; CHECK-GI-NEXT: movi d7, #0000000000000000 +; CHECK-GI-NEXT: movi d2, #0000000000000000 ; CHECK-GI-NEXT: fmov s24, w10 ; CHECK-GI-NEXT: mov v23.h[4], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #160] @@ -5929,8 +5929,8 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 ; CHECK-GI-NEXT: ldr w10, [sp, #168] ; CHECK-GI-NEXT: sbfx w9, w9, #8, #8 ; CHECK-GI-NEXT: lsl w8, w8, #8 -; CHECK-GI-NEXT: fmov s4, wzr -; CHECK-GI-NEXT: fmov s3, wzr +; CHECK-GI-NEXT: movi d4, #0000000000000000 +; CHECK-GI-NEXT: movi d3, #0000000000000000 ; CHECK-GI-NEXT: mov v24.h[1], w12 ; CHECK-GI-NEXT: lsl w12, w6, #8 ; CHECK-GI-NEXT: mov v22.h[5], w11 @@ -5941,8 +5941,8 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 ; CHECK-GI-NEXT: ldr w11, [sp, #184] ; CHECK-GI-NEXT: ldr w9, [sp, #192] ; CHECK-GI-NEXT: sbfx w10, w10, #8, #8 -; CHECK-GI-NEXT: fmov s5, wzr -; CHECK-GI-NEXT: fmov s1, wzr +; CHECK-GI-NEXT: movi d5, #0000000000000000 +; CHECK-GI-NEXT: movi d1, #0000000000000000 ; CHECK-GI-NEXT: mov v24.h[2], w8 ; CHECK-GI-NEXT: mov v22.h[6], w12 ; CHECK-GI-NEXT: ldr w12, [sp, #208] @@ -5951,7 +5951,7 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 ; CHECK-GI-NEXT: lsl w9, w9, #8 ; CHECK-GI-NEXT: lsl w12, w12, #8 ; CHECK-GI-NEXT: ldr w8, [sp, #200] -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: lsl w13, w13, #8 ; CHECK-GI-NEXT: sbfx w9, w9, #8, #8 ; CHECK-GI-NEXT: mov v19.s[1], wzr @@ -6813,10 +6813,10 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, 
<33 x i8> % ; CHECK-GI-NEXT: fmov s23, w12 ; CHECK-GI-NEXT: sxtb w10, w10 ; CHECK-GI-NEXT: sxtb w12, w7 -; CHECK-GI-NEXT: fmov s18, wzr +; CHECK-GI-NEXT: movi d18, #0000000000000000 ; CHECK-GI-NEXT: sxtb w8, w8 -; CHECK-GI-NEXT: fmov s19, wzr -; CHECK-GI-NEXT: fmov s20, wzr +; CHECK-GI-NEXT: movi d19, #0000000000000000 +; CHECK-GI-NEXT: movi d20, #0000000000000000 ; CHECK-GI-NEXT: mov v22.s[1], w9 ; CHECK-GI-NEXT: sxtb w9, w2 ; CHECK-GI-NEXT: mov v23.s[1], w13 @@ -6825,10 +6825,10 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-GI-NEXT: sxtb w11, w6 ; CHECK-GI-NEXT: ldr w13, [sp, #232] ; CHECK-GI-NEXT: mov v18.s[1], wzr -; CHECK-GI-NEXT: mov v19.s[1], wzr +; CHECK-GI-NEXT: movi d21, #0000000000000000 ; CHECK-GI-NEXT: fmov s25, w8 ; CHECK-GI-NEXT: ldr w8, [sp, #80] -; CHECK-GI-NEXT: fmov s21, wzr +; CHECK-GI-NEXT: mov v19.s[1], wzr ; CHECK-GI-NEXT: mov v22.s[2], w9 ; CHECK-GI-NEXT: mov v24.s[1], w10 ; CHECK-GI-NEXT: sxtb w10, w3 @@ -6837,10 +6837,10 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-GI-NEXT: sxtb w8, w8 ; CHECK-GI-NEXT: ldr w11, [sp, #136] ; CHECK-GI-NEXT: mov v18.s[2], wzr -; CHECK-GI-NEXT: mov v19.s[2], wzr +; CHECK-GI-NEXT: movi d6, #0000000000000000 ; CHECK-GI-NEXT: sxtb w9, w9 -; CHECK-GI-NEXT: fmov s6, wzr -; CHECK-GI-NEXT: fmov s7, wzr +; CHECK-GI-NEXT: mov v19.s[2], wzr +; CHECK-GI-NEXT: movi d7, #0000000000000000 ; CHECK-GI-NEXT: mov v22.s[3], w10 ; CHECK-GI-NEXT: ldr w10, [sp, #128] ; CHECK-GI-NEXT: mov v24.s[2], w8 @@ -6855,7 +6855,7 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-GI-NEXT: sxtb w8, w8 ; CHECK-GI-NEXT: fmov s26, w10 ; CHECK-GI-NEXT: ldr w10, [sp, #144] -; CHECK-GI-NEXT: mov v18.s[3], wzr +; CHECK-GI-NEXT: movi d5, #0000000000000000 ; CHECK-GI-NEXT: mov v25.s[2], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #120] ; CHECK-GI-NEXT: sxtb w12, w12 @@ -6872,14 +6872,14 @@ define i32 
@test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-GI-NEXT: ldr w9, [sp, #192] ; CHECK-GI-NEXT: add v22.4s, v22.4s, v23.4s ; CHECK-GI-NEXT: mov v27.s[1], w8 -; CHECK-GI-NEXT: mov v19.s[3], wzr -; CHECK-GI-NEXT: fmov s5, wzr +; CHECK-GI-NEXT: movi d16, #0000000000000000 +; CHECK-GI-NEXT: movi d17, #0000000000000000 ; CHECK-GI-NEXT: mov v26.s[2], w10 ; CHECK-GI-NEXT: ldr w10, [sp, #200] ; CHECK-GI-NEXT: sxtb w9, w9 -; CHECK-GI-NEXT: fmov s16, wzr -; CHECK-GI-NEXT: fmov s17, wzr -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 +; CHECK-GI-NEXT: movi d1, #0000000000000000 +; CHECK-GI-NEXT: movi d3, #0000000000000000 ; CHECK-GI-NEXT: sxtb w8, w10 ; CHECK-GI-NEXT: sxtb w10, w12 ; CHECK-GI-NEXT: fmov s28, w9 @@ -6936,7 +6936,7 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-GI-NEXT: fmov s9, w12 ; CHECK-GI-NEXT: sxtb w11, w11 ; CHECK-GI-NEXT: sxtb w10, w10 -; CHECK-GI-NEXT: fmov s1, wzr +; CHECK-GI-NEXT: movi d2, #0000000000000000 ; CHECK-GI-NEXT: sxtb w9, w9 ; CHECK-GI-NEXT: mov v30.s[3], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #632] @@ -6948,10 +6948,10 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-GI-NEXT: ldr w10, [sp, #688] ; CHECK-GI-NEXT: sxtb w11, w11 ; CHECK-GI-NEXT: sxtb w8, w8 -; CHECK-GI-NEXT: fmov s3, wzr +; CHECK-GI-NEXT: movi d4, #0000000000000000 ; CHECK-GI-NEXT: sxtb w9, w9 ; CHECK-GI-NEXT: sxtb w10, w10 -; CHECK-GI-NEXT: fmov s2, wzr +; CHECK-GI-NEXT: mov v18.s[3], wzr ; CHECK-GI-NEXT: mov v9.s[2], w11 ; CHECK-GI-NEXT: ldr w11, [sp, #664] ; CHECK-GI-NEXT: mov v10.s[1], w8 @@ -6963,7 +6963,7 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-GI-NEXT: ldr w10, [sp, #672] ; CHECK-GI-NEXT: sxtb w8, w8 ; CHECK-GI-NEXT: sxtb w9, w9 -; CHECK-GI-NEXT: fmov s4, wzr +; CHECK-GI-NEXT: mov v19.s[3], wzr ; CHECK-GI-NEXT: mov v11.s[1], w11 ; CHECK-GI-NEXT: sxtb w10, w10 ; CHECK-GI-NEXT: mov 
v20.s[1], wzr @@ -7121,15 +7121,15 @@ define i32 @test_udot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; ; CHECK-GI-LABEL: test_udot_v48i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 ; CHECK-GI-NEXT: ldr q7, [x0, #32] ; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 ; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-NEXT: ldr q17, [x1, #32] ; CHECK-GI-NEXT: ldp q4, q5, [x0] -; CHECK-GI-NEXT: mov v0.s[1], wzr ; CHECK-GI-NEXT: ldp q6, q16, [x1] +; CHECK-GI-NEXT: mov v0.s[1], wzr ; CHECK-GI-NEXT: udot v2.4s, v17.16b, v7.16b ; CHECK-GI-NEXT: udot v1.4s, v6.16b, v4.16b ; CHECK-GI-NEXT: udot v3.4s, v16.16b, v5.16b @@ -7169,7 +7169,7 @@ define i32 @test_udot_v48i8_nomla(ptr nocapture readonly %a1) { ; ; CHECK-GI-LABEL: test_udot_v48i8_nomla: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: movi v1.16b, #1 ; CHECK-GI-NEXT: ldr q7, [x0, #32] ; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 @@ -7212,15 +7212,15 @@ define i32 @test_sdot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; ; CHECK-GI-LABEL: test_sdot_v48i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 ; CHECK-GI-NEXT: ldr q7, [x0, #32] ; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 ; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-NEXT: ldr q17, [x1, #32] ; CHECK-GI-NEXT: ldp q4, q5, [x0] -; CHECK-GI-NEXT: mov v0.s[1], wzr ; CHECK-GI-NEXT: ldp q6, q16, [x1] +; CHECK-GI-NEXT: mov v0.s[1], wzr ; CHECK-GI-NEXT: sdot v2.4s, v17.16b, v7.16b ; CHECK-GI-NEXT: sdot v1.4s, v6.16b, v4.16b ; CHECK-GI-NEXT: sdot v3.4s, v16.16b, v5.16b @@ -7639,7 +7639,7 @@ define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 ; CHECK-GI-NEXT: fmov s2, w0 ; CHECK-GI-NEXT: ldr w11, [sp, 
#208] ; CHECK-GI-NEXT: ldr w8, [sp, #216] -; CHECK-GI-NEXT: fmov s1, wzr +; CHECK-GI-NEXT: movi d1, #0000000000000000 ; CHECK-GI-NEXT: fmov s3, w10 ; CHECK-GI-NEXT: ldr w10, [sp, #336] ; CHECK-GI-NEXT: ldr w12, [sp, #720] @@ -7663,7 +7663,7 @@ define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 ; CHECK-GI-NEXT: ldr w11, [sp, #16] ; CHECK-GI-NEXT: mov v7.b[1], w9 ; CHECK-GI-NEXT: ldr w9, [sp, #480] -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: mov v6.b[1], w8 ; CHECK-GI-NEXT: ldr w8, [sp, #96] ; CHECK-GI-NEXT: mov v4.b[2], w10 @@ -8271,7 +8271,7 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> % ; CHECK-GI-NEXT: fmov s2, w0 ; CHECK-GI-NEXT: ldr w10, [sp, #216] ; CHECK-GI-NEXT: ldr w12, [sp, #848] -; CHECK-GI-NEXT: fmov s1, wzr +; CHECK-GI-NEXT: movi d1, #0000000000000000 ; CHECK-GI-NEXT: fmov s4, w9 ; CHECK-GI-NEXT: fmov s3, w11 ; CHECK-GI-NEXT: ldr w11, [sp, #720] @@ -8295,7 +8295,7 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> % ; CHECK-GI-NEXT: mov v2.b[2], w2 ; CHECK-GI-NEXT: mov v3.b[2], w10 ; CHECK-GI-NEXT: ldr w10, [sp, #864] -; CHECK-GI-NEXT: fmov s0, wzr +; CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: mov v7.b[1], w11 ; CHECK-GI-NEXT: ldr w11, [sp, #992] ; CHECK-GI-NEXT: mov v4.b[2], w8 diff --git a/llvm/test/CodeGen/AArch64/neon-saba.ll b/llvm/test/CodeGen/AArch64/neon-saba.ll index 19967bd1a69e..ddb85d6dee03 100644 --- a/llvm/test/CodeGen/AArch64/neon-saba.ll +++ b/llvm/test/CodeGen/AArch64/neon-saba.ll @@ -12,9 +12,9 @@ define <4 x i32> @saba_abs_4s(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 { ; ; CHECK-GI-LABEL: saba_abs_4s: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-NEXT: sub v1.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: abs v1.4s, v1.4s -; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: saba v0.4s, v1.4s, v3.4s ; CHECK-GI-NEXT: ret %sub = sub nsw <4 x 
i32> %b, %c %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true) @@ -30,9 +30,9 @@ define <2 x i32> @saba_abs_2s(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 { ; ; CHECK-GI-LABEL: saba_abs_2s: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-NEXT: sub v1.2s, v1.2s, v2.2s -; CHECK-GI-NEXT: abs v1.2s, v1.2s -; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: saba v0.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: ret %sub = sub nsw <2 x i32> %b, %c %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %sub, i1 true) @@ -48,9 +48,9 @@ define <8 x i16> @saba_abs_8h(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 { ; ; CHECK-GI-LABEL: saba_abs_8h: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-NEXT: sub v1.8h, v1.8h, v2.8h -; CHECK-GI-NEXT: abs v1.8h, v1.8h -; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: saba v0.8h, v1.8h, v3.8h ; CHECK-GI-NEXT: ret %sub = sub nsw <8 x i16> %b, %c %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true) @@ -66,9 +66,9 @@ define <4 x i16> @saba_abs_4h(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 { ; ; CHECK-GI-LABEL: saba_abs_4h: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-NEXT: sub v1.4h, v1.4h, v2.4h -; CHECK-GI-NEXT: abs v1.4h, v1.4h -; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: saba v0.4h, v1.4h, v3.4h ; CHECK-GI-NEXT: ret %sub = sub nsw <4 x i16> %b, %c %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %sub, i1 true) @@ -84,9 +84,9 @@ define <16 x i8> @saba_abs_16b(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 { ; ; CHECK-GI-LABEL: saba_abs_16b: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-NEXT: sub v1.16b, v1.16b, v2.16b -; CHECK-GI-NEXT: abs v1.16b, v1.16b -; CHECK-GI-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: saba v0.16b, v1.16b, v3.16b ; CHECK-GI-NEXT: ret %sub = sub nsw <16 x i8> %b, %c %abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %sub, i1 true) @@ -102,9 
+102,9 @@ define <8 x i8> @saba_abs_8b(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { ; ; CHECK-GI-LABEL: saba_abs_8b: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-NEXT: sub v1.8b, v1.8b, v2.8b -; CHECK-GI-NEXT: abs v1.8b, v1.8b -; CHECK-GI-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: saba v0.8b, v1.8b, v3.8b ; CHECK-GI-NEXT: ret %sub = sub nsw <8 x i8> %b, %c %abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %sub, i1 true) @@ -174,6 +174,214 @@ define <8 x i8> @saba_sabd_8b(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 { ret <8 x i8> %add } +; SABA from ADD(SABD(X, ZEROS)) + +define <4 x i32> @saba_sabd_zeros_4s(<4 x i32> %a, <4 x i32> %b) #0 { +; CHECK-LABEL: saba_sabd_zeros_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.4s, v1.4s, v2.4s +; CHECK-NEXT: ret + %sabd = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %b, <4 x i32> zeroinitializer) + %add = add <4 x i32> %sabd, %a + ret <4 x i32> %add +} + +define <2 x i32> @saba_sabd_zeros_2s(<2 x i32> %a, <2 x i32> %b) #0 { +; CHECK-LABEL: saba_sabd_zeros_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.2s, v1.2s, v2.2s +; CHECK-NEXT: ret + %sabd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> zeroinitializer) + %add = add <2 x i32> %sabd, %a + ret <2 x i32> %add +} + +define <8 x i16> @saba_sabd_zeros_8h(<8 x i16> %a, <8 x i16> %b) #0 { +; CHECK-LABEL: saba_sabd_zeros_8h: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.8h, v1.8h, v2.8h +; CHECK-NEXT: ret + %sabd = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %b, <8 x i16> zeroinitializer) + %add = add <8 x i16> %sabd, %a + ret <8 x i16> %add +} + +define <4 x i16> @saba_sabd_zeros_4h(<4 x i16> %a, <4 x i16> %b) #0 { +; CHECK-LABEL: saba_sabd_zeros_4h: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.4h, v1.4h, v2.4h +; CHECK-NEXT: ret 
+ %sabd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> zeroinitializer) + %add = add <4 x i16> %sabd, %a + ret <4 x i16> %add +} + +define <16 x i8> @saba_sabd_zeros_16b(<16 x i8> %a, <16 x i8> %b) #0 { +; CHECK-LABEL: saba_sabd_zeros_16b: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %sabd = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %b, <16 x i8> zeroinitializer) + %add = add <16 x i8> %sabd, %a + ret <16 x i8> %add +} + +define <8 x i8> @saba_sabd_zeros_8b(<8 x i8> %a, <8 x i8> %b) #0 { +; CHECK-LABEL: saba_sabd_zeros_8b: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %sabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> zeroinitializer) + %add = add <8 x i8> %sabd, %a + ret <8 x i8> %add +} + +define <4 x i32> @saba_abs_zeros_4s(<4 x i32> %a, <4 x i32> %b) #0 { +; CHECK-LABEL: saba_abs_zeros_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.4s, v1.4s, v2.4s +; CHECK-NEXT: ret + %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %b, i1 true) + %add = add <4 x i32> %a, %abs + ret <4 x i32> %add +} + +define <2 x i32> @saba_abs_zeros_2s(<2 x i32> %a, <2 x i32> %b) #0 { +; CHECK-LABEL: saba_abs_zeros_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.2s, v1.2s, v2.2s +; CHECK-NEXT: ret + %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %b, i1 true) + %add = add <2 x i32> %a, %abs + ret <2 x i32> %add +} + +define <8 x i16> @saba_abs_zeros_8h(<8 x i16> %a, <8 x i16> %b) #0 { +; CHECK-LABEL: saba_abs_zeros_8h: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.8h, v1.8h, v2.8h +; CHECK-NEXT: ret + %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %b, i1 true) + %add = add <8 x i16> %a, %abs + ret <8 x i16> %add +} + +define <4 x i16> 
@saba_abs_zeros_4h(<4 x i16> %a, <4 x i16> %b) #0 { +; CHECK-LABEL: saba_abs_zeros_4h: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.4h, v1.4h, v2.4h +; CHECK-NEXT: ret + %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %b, i1 true) + %add = add <4 x i16> %a, %abs + ret <4 x i16> %add +} + +define <16 x i8> @saba_abs_zeros_16b(<16 x i8> %a, <16 x i8> %b) #0 { +; CHECK-LABEL: saba_abs_zeros_16b: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %b, i1 true) + %add = add <16 x i8> %a, %abs + ret <16 x i8> %add +} + +define <8 x i8> @saba_abs_zeros_8b(<8 x i8> %a, <8 x i8> %b) #0 { +; CHECK-LABEL: saba_abs_zeros_8b: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: saba v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %b, i1 true) + %add = add <8 x i8> %a, %abs + ret <8 x i8> %add +} + +; SABAL from ADD(ZEXT(SABD(X, ZEROS))) + +define <2 x i64> @sabal_sabd_zeros_2s(<2 x i64> %a, <2 x i32> %b) #0 { +; CHECK-LABEL: sabal_sabd_zeros_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: sabal v0.2d, v1.2s, v2.2s +; CHECK-NEXT: ret + %sabd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> zeroinitializer) + %sabd.zext = zext <2 x i32> %sabd to <2 x i64> + %add = add <2 x i64> %sabd.zext, %a + ret <2 x i64> %add +} + +define <4 x i32> @sabal_sabd_zeros_4h(<4 x i32> %a, <4 x i16> %b) #0 { +; CHECK-LABEL: sabal_sabd_zeros_4h: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: sabal v0.4s, v1.4h, v2.4h +; CHECK-NEXT: ret + %sabd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> zeroinitializer) + %sabd.zext = zext <4 x i16> %sabd to <4 x i32> + %add = add <4 x i32> %sabd.zext, %a + ret <4 x i32> %add +} + +define <8 x i16> @sabal_sabd_zeros_8b(<8 
x i16> %a, <8 x i8> %b) #0 { +; CHECK-LABEL: sabal_sabd_zeros_8b: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: sabal v0.8h, v1.8b, v2.8b +; CHECK-NEXT: ret + %sabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> zeroinitializer) + %sabd.zext = zext <8 x i8> %sabd to <8 x i16> + %add = add <8 x i16> %sabd.zext, %a + ret <8 x i16> %add +} + +define <2 x i64> @sabal_abs_zeros_2s(<2 x i64> %a, <2 x i32> %b) #0 { +; CHECK-LABEL: sabal_abs_zeros_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: sabal v0.2d, v1.2s, v2.2s +; CHECK-NEXT: ret + %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %b, i1 true) + %abs.zext = zext <2 x i32> %abs to <2 x i64> + %add = add <2 x i64> %a, %abs.zext + ret <2 x i64> %add +} + +define <4 x i32> @sabal_abs_zeros_4h(<4 x i32> %a, <4 x i16> %b) #0 { +; CHECK-LABEL: sabal_abs_zeros_4h: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: sabal v0.4s, v1.4h, v2.4h +; CHECK-NEXT: ret + %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %b, i1 true) + %abs.zext = zext <4 x i16> %abs to <4 x i32> + %add = add <4 x i32> %a, %abs.zext + ret <4 x i32> %add +} + +define <8 x i16> @sabal_abs_zeros_8b(<8 x i16> %a, <8 x i8> %b) #0 { +; CHECK-LABEL: sabal_abs_zeros_8b: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: sabal v0.8h, v1.8b, v2.8b +; CHECK-NEXT: ret + %abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %b, i1 true) + %abs.zext = zext <8 x i8> %abs to <8 x i16> + %add = add <8 x i16> %a, %abs.zext + ret <8 x i16> %add +} + declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1) declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1) diff --git a/llvm/test/CodeGen/AArch64/pr157118.ll b/llvm/test/CodeGen/AArch64/pr157118.ll new file mode 100644 index 000000000000..cdef46fe8f22 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/pr157118.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have 
been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64 < %s | FileCheck %s + +define <8 x i8> @test_vaba_u8(<8 x i8> noundef %a, <8 x i8> noundef %b, <8 x i8> noundef %c) { +; CHECK-LABEL: test_vaba_u8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: uaba v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret +entry: + %0 = tail call <8 x i8> asm sideeffect "", "=w,0"(<8 x i8> %a) + %vabd.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c) + %add.i = add <8 x i8> %vabd.i, %0 + ret <8 x i8> %add.i +} diff --git a/llvm/test/CodeGen/AArch64/print-pipeline-passes.ll b/llvm/test/CodeGen/AArch64/print-pipeline-passes.ll new file mode 100644 index 000000000000..5852f97a6379 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/print-pipeline-passes.ll @@ -0,0 +1,10 @@ +; RUN: opt -mtriple=aarch64 -S -passes='default<O0>' -print-pipeline-passes < %s | FileCheck --check-prefix=O0 %s +; RUN: opt -mtriple=aarch64 -S -passes='default<O2>' -print-pipeline-passes < %s | FileCheck %s + +; CHECK: loop-idiom-vectorize +; O0: {{^}}function(ee-instrument<>),always-inline,coro-cond(coro-early,cgscc(coro-split),coro-cleanup,globaldce),function(annotation-remarks),verify,print{{$}} + +define void @foo() { +entry: + ret void +} diff --git a/llvm/test/CodeGen/AArch64/ptrauth-call.ll b/llvm/test/CodeGen/AArch64/ptrauth-call.ll index 700686b9f194..fc555a882be2 100644 --- a/llvm/test/CodeGen/AArch64/ptrauth-call.ll +++ b/llvm/test/CodeGen/AArch64/ptrauth-call.ll @@ -181,8 +181,9 @@ define void @test_tailcall_omit_mov_x16_x16(ptr %objptr) #0 { ; ELF-NEXT: movk x8, #6503, lsl #48 ; ELF-NEXT: autda x1, x8 ; ELF-NEXT: ldr x2, [x1] -; ELF-NEXT: movk x1, #54167, lsl #48 -; ELF-NEXT: braa x2, x1 +; ELF-NEXT: mov x16, x1 +; ELF-NEXT: movk x16, #54167, lsl #48 +; ELF-NEXT: braa x2, x16 %vtable.signed = load ptr, ptr %objptr, align 8 %objptr.int = ptrtoint ptr %objptr to i64 %vtable.discr = tail call i64 
@llvm.ptrauth.blend(i64 %objptr.int, i64 6503) @@ -213,8 +214,9 @@ define i32 @test_call_omit_extra_moves(ptr %objptr) #0 { ; ELF-NEXT: movk x9, #6503, lsl #48 ; ELF-NEXT: autda x8, x9 ; ELF-NEXT: ldr x9, [x8] -; ELF-NEXT: movk x8, #34646, lsl #48 -; ELF-NEXT: blraa x9, x8 +; ELF-NEXT: mov x17, x8 +; ELF-NEXT: movk x17, #34646, lsl #48 +; ELF-NEXT: blraa x9, x17 ; ELF-NEXT: mov w0, #42 ; ELF-NEXT: ldr x30, [sp], #16 ; CHECK-NEXT: ret @@ -230,6 +232,97 @@ define i32 @test_call_omit_extra_moves(ptr %objptr) #0 { ret i32 42 } +; The second BLRA instruction should not reuse its AddrDisc operand as a scratch register (returned later). +define i64 @test_call_discr_csr_live(ptr %fnptr, i64 %addr.discr) #0 { +; ELF-LABEL: test_call_discr_csr_live: +; ELF-NEXT: str x30, [sp, #-32]! +; ELF-NEXT: stp x20, x19, [sp, #16] +; ELF-DAG: mov x[[FNPTR:[0-9]+]], x0 +; ELF-DAG: mov x[[ADDR_DISC:[0-9]+]], x1 +; ELF-DAG: mov x17, x1 +; ELF-NEXT: movk x17, #6503, lsl #48 +; ELF-NEXT: blraa x0, x17 +; ELF-NEXT: mov x17, x[[ADDR_DISC]] +; ELF-NEXT: movk x17, #6503, lsl #48 +; ELF-NEXT: blraa x[[FNPTR]], x17 +; ELF-NEXT: mov x0, x[[ADDR_DISC]] +; ELF-NEXT: ldp x20, x19, [sp, #16] +; ELF-NEXT: ldr x30, [sp], #32 +; ELF-NEXT: ret + %discr = tail call i64 @llvm.ptrauth.blend(i64 %addr.discr, i64 6503) + tail call void %fnptr() [ "ptrauth"(i32 0, i64 %discr) ] + tail call void %fnptr() [ "ptrauth"(i32 0, i64 %discr) ] + ret i64 %addr.discr +} + +; The second BLRA instruction may reuse its AddrDisc operand as a scratch register. +define i64 @test_call_discr_csr_killed(ptr %fnptr, i64 %addr.discr) #0 { +; ELF-LABEL: test_call_discr_csr_killed: +; ELF-NEXT: str x30, [sp, #-32]! 
+; ELF-NEXT: stp x20, x19, [sp, #16] +; ELF-DAG: mov x[[FNPTR:[0-9]+]], x0 +; ELF-DAG: mov x[[ADDR_DISC:[0-9]+]], x1 +; ELF-DAG: mov x17, x1 +; ELF-NEXT: movk x17, #6503, lsl #48 +; ELF-NEXT: blraa x0, x17 +; ELF-DAG: mov x17, x[[ADDR_DISC]] +; ELF-NEXT: movk x17, #6503, lsl #48 +; ELF-NEXT: blraa x[[FNPTR]], x17 +; ELF-NEXT: ldp x20, x19, [sp, #16] +; ELF-NEXT: mov w0, #42 +; ELF-NEXT: ldr x30, [sp], #32 +; ELF-NEXT: ret + %discr = tail call i64 @llvm.ptrauth.blend(i64 %addr.discr, i64 6503) + tail call void %fnptr() [ "ptrauth"(i32 0, i64 %discr) ] + tail call void %fnptr() [ "ptrauth"(i32 0, i64 %discr) ] + ret i64 42 +} + +; BLRA instruction should not reuse its AddrDisc operand as a scratch register (function argument). +define i64 @test_call_discr_arg(ptr %fnptr, i64 %addr.discr) #0 { +; ELF-LABEL: test_call_discr_arg: +; ELF-NEXT: str x30, [sp, #-16]! +; ELF-NEXT: mov x8, x0 +; ELF-NEXT: mov x0, xzr +; ELF-NEXT: mov x17, x1 +; ELF-NEXT: movk x17, #6503, lsl #48 +; ELF-NEXT: blraa x8, x17 +; ELF-NEXT: mov w0, #42 +; ELF-NEXT: ldr x30, [sp], #16 +; ELF-NEXT: ret + %discr = tail call i64 @llvm.ptrauth.blend(i64 %addr.discr, i64 6503) + tail call void %fnptr(ptr null, i64 %addr.discr) [ "ptrauth"(i32 0, i64 %discr) ] + ret i64 42 +} + +; BLRA instruction may reuse its AddrDisc operand as a scratch register. +define i64 @test_call_discr_non_arg(ptr %fnptr, i64 %addr.discr) #0 { +; ELF-LABEL: test_call_discr_non_arg: +; ELF-NEXT: str x30, [sp, #-16]! +; ELF-NEXT: mov x17, x1 +; ELF-NEXT: movk x17, #6503, lsl #48 +; ELF-NEXT: blraa x0, x17 +; ELF-NEXT: mov w0, #42 +; ELF-NEXT: ldr x30, [sp], #16 +; ELF-NEXT: ret + %discr = tail call i64 @llvm.ptrauth.blend(i64 %addr.discr, i64 6503) + tail call void %fnptr() [ "ptrauth"(i32 0, i64 %discr) ] + ret i64 42 +} + +; AUTH_TCRETURN instruction should not reuse its AddrDisc operand as a scratch register (function argument). 
+define i64 @test_tailcall_discr_arg(ptr %fnptr, i64 %addr.discr) #0 { +; ELF-LABEL: test_tailcall_discr_arg: +; ELF-NEXT: mov x2, x0 +; ELF-NEXT: mov x0, xzr +; ELF-NEXT: mov x16, x1 +; ELF-NEXT: movk x16, #6503, lsl #48 +; ELF-NEXT: braa x2, x16 + %discr = tail call i64 @llvm.ptrauth.blend(i64 %addr.discr, i64 6503) + %result = tail call i64 %fnptr(ptr null, i64 %addr.discr) [ "ptrauth"(i32 0, i64 %discr) ] + ret i64 %result +} + define i32 @test_call_ia_arg(ptr %arg0, i64 %arg1) #0 { ; DARWIN-LABEL: test_call_ia_arg: ; DARWIN-NEXT: stp x29, x30, [sp, #-16]! diff --git a/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll b/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll index 2634beb4e359..52b38a563200 100644 --- a/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll +++ b/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll @@ -755,5 +755,128 @@ define i64 @test_auth_ia_swapped(i64 %arg, i64 %arg1) { ret i64 %tmp } +; Authentications should not be speculated, as they crash on failure and it is +; perfectly correct to dynamically choose the signing schema or whether to +; perform authentication at all. 
+define ptr @auth_speculation(i64 %signed, i1 %cond) { +; UNCHECKED-LABEL: auth_speculation: +; UNCHECKED: %bb.0: +; UNCHECKED-DARWIN-NEXT: mov x16, x0 +; UNCHECKED-DARWIN-NEXT: tbz w1, #0, [[BB_ELSE:[A-Za-z0-9_.]+]] +; UNCHECKED-DARWIN-NEXT: %bb.1: +; UNCHECKED-DARWIN-NEXT: autdza x16 +; UNCHECKED-DARWIN-NEXT: b [[BB_RETURN:[A-Za-z0-9_.]+]] +; UNCHECKED-DARWIN-NEXT: [[BB_ELSE]]: +; UNCHECKED-DARWIN-NEXT: autdzb x16 +; UNCHECKED-DARWIN-NEXT: [[BB_RETURN]]: +; UNCHECKED-DARWIN-NEXT: ldr x8, [x16] +; UNCHECKED-ELF-NEXT: tbz w1, #0, [[BB_ELSE:[A-Za-z0-9_.]+]] +; UNCHECKED-ELF-NEXT: %bb.1: +; UNCHECKED-ELF-NEXT: autdza x0 +; UNCHECKED-ELF-NEXT: b [[BB_RETURN:[A-Za-z0-9_.]+]] +; UNCHECKED-ELF-NEXT: [[BB_ELSE]]: +; UNCHECKED-ELF-NEXT: autdzb x0 +; UNCHECKED-ELF-NEXT: [[BB_RETURN]]: +; UNCHECKED-ELF-NEXT: ldr x8, [x0] +; UNCHECKED-NEXT: ldr x8, [x8] +; UNCHECKED-NEXT: ldr x8, [x8] +; UNCHECKED-NEXT: ldr x0, [x8] +; UNCHECKED-NEXT: ret +; +; CHECKED-LABEL: auth_speculation: +; CHECKED: %bb.0: +; CHECKED-DARWIN-NEXT: mov x16, x0 +; CHECKED-DARWIN-NEXT: tbz w1, #0, [[BB_ELSE:[A-Za-z0-9_.]+]] +; CHECKED-DARWIN-NEXT: %bb.1: +; CHECKED-DARWIN-NEXT: autdza x16 +; CHECKED-DARWIN-NEXT: b [[BB_RETURN:[A-Za-z0-9_.]+]] +; CHECKED-DARWIN-NEXT: [[BB_ELSE]]: +; CHECKED-DARWIN-NEXT: autdzb x16 +; CHECKED-DARWIN-NEXT: [[BB_RETURN]]: +; CHECKED-DARWIN-NEXT: ldr x8, [x16] +; CHECKED-ELF-NEXT: tbz w1, #0, [[BB_ELSE:[A-Za-z0-9_.]+]] +; CHECKED-ELF-NEXT: %bb.1: +; CHECKED-ELF-NEXT: autdza x0 +; CHECKED-ELF-NEXT: b [[BB_RETURN:[A-Za-z0-9_.]+]] +; CHECKED-ELF-NEXT: [[BB_ELSE]]: +; CHECKED-ELF-NEXT: autdzb x0 +; CHECKED-ELF-NEXT: [[BB_RETURN]]: +; CHECKED-ELF-NEXT: ldr x8, [x0] +; CHECKED-NEXT: ldr x8, [x8] +; CHECKED-NEXT: ldr x8, [x8] +; CHECKED-NEXT: ldr x0, [x8] +; CHECKED-NEXT: ret +; +; TRAP-LABEL: auth_speculation: +; TRAP: %bb.0: +; TRAP-DARWIN-NEXT: mov x16, x0 +; TRAP-DARWIN-NEXT: tbz w1, #0, [[BB_ELSE:[A-Za-z0-9_.]+]] +; TRAP-DARWIN-NEXT: %bb.1: +; TRAP-DARWIN-NEXT: autdza x16 +; 
TRAP-DARWIN-NEXT: mov x17, x16 +; TRAP-DARWIN-NEXT: xpacd x17 +; TRAP-DARWIN-NEXT: cmp x16, x17 +; TRAP-DARWIN-NEXT: b.eq [[L]]auth_success_18 +; TRAP-DARWIN-NEXT: brk #0xc472 +; TRAP-DARWIN-NEXT: [[L]]auth_success_18: +; TRAP-DARWIN-NEXT: b [[BB_RETURN:[A-Za-z0-9_.]+]] +; TRAP-DARWIN-NEXT: [[BB_ELSE]]: +; TRAP-DARWIN-NEXT: autdzb x16 +; TRAP-DARWIN-NEXT: mov x17, x16 +; TRAP-DARWIN-NEXT: xpacd x17 +; TRAP-DARWIN-NEXT: cmp x16, x17 +; TRAP-DARWIN-NEXT: b.eq [[L]]auth_success_19 +; TRAP-DARWIN-NEXT: brk #0xc473 +; TRAP-DARWIN-NEXT: [[L]]auth_success_19: +; TRAP-DARWIN-NEXT: [[BB_RETURN]]: +; TRAP-DARWIN-NEXT: ldr x8, [x16] +; TRAP-ELF-NEXT: tbz w1, #0, [[BB_ELSE:[A-Za-z0-9_.]+]] +; TRAP-ELF-NEXT: %bb.1: +; TRAP-ELF-NEXT: autdza x0 +; TRAP-ELF-NEXT: mov x8, x0 +; TRAP-ELF-NEXT: xpacd x8 +; TRAP-ELF-NEXT: cmp x0, x8 +; TRAP-ELF-NEXT: b.eq [[L]]auth_success_18 +; TRAP-ELF-NEXT: brk #0xc472 +; TRAP-ELF-NEXT: [[L]]auth_success_18: +; TRAP-ELF-NEXT: b [[BB_RETURN:[A-Za-z0-9_.]+]] +; TRAP-ELF-NEXT: [[BB_ELSE]]: +; TRAP-ELF-NEXT: autdzb x0 +; TRAP-ELF-NEXT: mov x8, x0 +; TRAP-ELF-NEXT: xpacd x8 +; TRAP-ELF-NEXT: cmp x0, x8 +; TRAP-ELF-NEXT: b.eq [[L]]auth_success_19 +; TRAP-ELF-NEXT: brk #0xc473 +; TRAP-ELF-NEXT: [[L]]auth_success_19: +; TRAP-ELF-NEXT: [[BB_RETURN]]: +; TRAP-ELF-NEXT: ldr x8, [x0] +; TRAP-NEXT: ldr x8, [x8] +; TRAP-NEXT: ldr x8, [x8] +; TRAP-NEXT: ldr x0, [x8] +; TRAP-NEXT: ret +entry: + br i1 %cond, label %if.then, label %if.else + +if.then: + %auted.then = tail call i64 @llvm.ptrauth.auth(i64 %signed, i32 2, i64 0) + br label %return + +if.else: + %auted.else = tail call i64 @llvm.ptrauth.auth(i64 %signed, i32 3, i64 0) + br label %return + +return: + %auted = phi i64 [ %auted.then, %if.then ], [ %auted.else, %if.else ] + + ; A sequence of instructions that is common to both "then" and "else" + ; branches and is expensive to duplicate. 
+ %ptr.0 = inttoptr i64 %auted to ptr + %ptr.1 = load ptr, ptr %ptr.0 + %ptr.2 = load ptr, ptr %ptr.1 + %ptr.3 = load ptr, ptr %ptr.2 + %ptr.4 = load ptr, ptr %ptr.3 + ret ptr %ptr.4 +} + declare i64 @llvm.ptrauth.auth(i64, i32, i64) declare i64 @llvm.ptrauth.resign(i64, i32, i64, i32, i64) diff --git a/llvm/test/CodeGen/AArch64/rand.ll b/llvm/test/CodeGen/AArch64/rand.ll index 706774d83b18..ed6d4b26ba56 100644 --- a/llvm/test/CodeGen/AArch64/rand.ll +++ b/llvm/test/CodeGen/AArch64/rand.ll @@ -4,11 +4,11 @@ define i32 @rndr(ptr %__addr) { ; CHECK-LABEL: rndr: ; CHECK: // %bb.0: -; CHECK-NEXT: mrs x9, RNDR -; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: cset w10, eq -; CHECK-NEXT: str x9, [x8] -; CHECK-NEXT: and w0, w10, #0x1 +; CHECK-NEXT: mrs x10, RNDR +; CHECK-NEXT: mov x9, x0 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: str x10, [x9] +; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret %1 = tail call { i64, i1 } @llvm.aarch64.rndr() %2 = extractvalue { i64, i1 } %1, 0 @@ -22,11 +22,11 @@ define i32 @rndr(ptr %__addr) { define i32 @rndrrs(ptr %__addr) { ; CHECK-LABEL: rndrrs: ; CHECK: // %bb.0: -; CHECK-NEXT: mrs x9, RNDRRS -; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: cset w10, eq -; CHECK-NEXT: str x9, [x8] -; CHECK-NEXT: and w0, w10, #0x1 +; CHECK-NEXT: mrs x10, RNDRRS +; CHECK-NEXT: mov x9, x0 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: str x10, [x9] +; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret %1 = tail call { i64, i1 } @llvm.aarch64.rndrrs() %2 = extractvalue { i64, i1 } %1, 0 diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll index c57383ad9b1e..599fa510d4ae 100644 --- a/llvm/test/CodeGen/AArch64/rem-by-const.ll +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -336,26 +336,15 @@ entry: } define i32 @ui32_100(i32 %a, i32 %b) { -; CHECK-SD-LABEL: ui32_100: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov w8, #34079 // =0x851f -; CHECK-SD-NEXT: mov w9, #100 // =0x64 -; CHECK-SD-NEXT: movk w8, #20971, lsl #16 -; CHECK-SD-NEXT: 
umull x8, w0, w8 -; CHECK-SD-NEXT: lsr x8, x8, #37 -; CHECK-SD-NEXT: msub w0, w8, w9, w0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: ui32_100: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #34079 // =0x851f -; CHECK-GI-NEXT: mov w9, #100 // =0x64 -; CHECK-GI-NEXT: movk w8, #20971, lsl #16 -; CHECK-GI-NEXT: umull x8, w0, w8 -; CHECK-GI-NEXT: lsr x8, x8, #32 -; CHECK-GI-NEXT: lsr w8, w8, #5 -; CHECK-GI-NEXT: msub w0, w8, w9, w0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: ui32_100: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: mov w9, #100 // =0x64 +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #37 +; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: ret entry: %s = urem i32 %a, 100 ret i32 %s @@ -1619,15 +1608,25 @@ entry: } define <8 x i8> @uv8i8_100(<8 x i8> %d, <8 x i8> %e) { -; CHECK-LABEL: uv8i8_100: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.8b, #41 -; CHECK-NEXT: movi v2.8b, #100 -; CHECK-NEXT: umull v1.8h, v0.8b, v1.8b -; CHECK-NEXT: shrn v1.8b, v1.8h, #8 -; CHECK-NEXT: ushr v1.8b, v1.8b, #4 -; CHECK-NEXT: mls v0.8b, v1.8b, v2.8b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: uv8i8_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v1.8b, #41 +; CHECK-SD-NEXT: movi v2.8b, #100 +; CHECK-SD-NEXT: umull v1.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-SD-NEXT: ushr v1.8b, v1.8b, #4 +; CHECK-SD-NEXT: mls v0.8b, v1.8b, v2.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv8i8_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v1.8b, #41 +; CHECK-GI-NEXT: movi v2.8b, #100 +; CHECK-GI-NEXT: umull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: ushr v1.8h, v1.8h, #12 +; CHECK-GI-NEXT: xtn v1.8b, v1.8h +; CHECK-GI-NEXT: mls v0.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: ret entry: %s = urem <8 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100> ret <8 x i8> %s @@ -2301,8 +2300,8 @@ define <4 x i16> @uv4i16_100(<4 x i16> %d, <4 x 
i16> %e) { ; CHECK-GI-NEXT: ldr d2, [x8, :lo12:.LCPI53_0] ; CHECK-GI-NEXT: umull v1.4s, v1.4h, v2.4h ; CHECK-GI-NEXT: movi v2.4h, #100 -; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 -; CHECK-GI-NEXT: ushr v1.4h, v1.4h, #1 +; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #17 +; CHECK-GI-NEXT: xtn v1.4h, v1.4s ; CHECK-GI-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-GI-NEXT: ret entry: @@ -2656,8 +2655,8 @@ define <2 x i32> @uv2i32_100(<2 x i32> %d, <2 x i32> %e) { ; CHECK-GI-NEXT: movi v2.2s, #100 ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI63_0] ; CHECK-GI-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 -; CHECK-GI-NEXT: ushr v1.2s, v1.2s, #5 +; CHECK-GI-NEXT: ushr v1.2d, v1.2d, #37 +; CHECK-GI-NEXT: xtn v1.2s, v1.2d ; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/scalable_masked_deinterleaved_loads.ll b/llvm/test/CodeGen/AArch64/scalable_masked_deinterleaved_loads.ll new file mode 100644 index 000000000000..c9b77a2109dc --- /dev/null +++ b/llvm/test/CodeGen/AArch64/scalable_masked_deinterleaved_loads.ll @@ -0,0 +1,287 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +define { <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld2_nxv16i8(<vscale x 16 x i1> %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask) + %wide.masked.vec = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8(ptr %p, i32 1, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison) + %deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %wide.masked.vec) + ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec +} + +define { 
<vscale x 8 x i16>, <vscale x 8 x i16> } @foo_ld2_nxv8i16(<vscale x 8 x i1> %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 16 x i1> @llvm.vector.interleave2.nxv16i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask) + %wide.masked.vec = call <vscale x 16 x i16> @llvm.masked.load.nxv16i16.p0(ptr %p, i32 2, <vscale x 16 x i1> %interleaved.mask, <vscale x 16 x i16> poison) + %deinterleaved.vec = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %wide.masked.vec) + ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleaved.vec +} + +define { <vscale x 4 x float>, <vscale x 4 x float> } @foo_ld2_nxv4f32(<vscale x 4 x i1> %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask) + %wide.masked.vec = call <vscale x 8 x float> @llvm.masked.load.nxv8f32(ptr %p, i32 4, <vscale x 8 x i1> %interleaved.mask, <vscale x 8 x float> poison) + %deinterleaved.vec = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %wide.masked.vec) + ret { <vscale x 4 x float>, <vscale x 4 x float> } %deinterleaved.vec +} + +define { <vscale x 2 x double>, <vscale x 2 x double> } @foo_ld2_nxv2f64(<vscale x 2 x i1> %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask) + %wide.masked.vec = call <vscale x 4 x double> @llvm.masked.load.nxv4f64(ptr %p, i32 8, <vscale x 4 x i1> %interleaved.mask, <vscale x 4 x double> poison) + %deinterleaved.vec = call { <vscale x 
2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec) + ret { <vscale x 2 x double>, <vscale x 2 x double> } %deinterleaved.vec +} + +define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld4_nxv16i8(<vscale x 16 x i1> %mask, ptr %p) { +; CHECK-LABEL: foo_ld4_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask) + %wide.masked.vec = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8(ptr %p, i32 1, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison) + %deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec) + ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec +} + +define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @foo_ld4_nxv8i16(<vscale x 8 x i1> %mask, ptr %p) { +; CHECK-LABEL: foo_ld4_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4h { z0.h - z3.h }, p0/z, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask) + %wide.masked.vec = call <vscale x 32 x i16> @llvm.masked.load.nxv32i16(ptr %p, i32 2, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i16> poison) + %deinterleaved.vec = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave4.nxv32i16(<vscale x 32 x i16> %wide.masked.vec) + ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleaved.vec +} + +define { <vscale x 
4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @foo_ld4_nxv4f32(<vscale x 4 x i1> %mask, ptr %p) { +; CHECK-LABEL: foo_ld4_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4w { z0.s - z3.s }, p0/z, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask) + %wide.masked.vec = call <vscale x 16 x float> @llvm.masked.load.nxv16f32(ptr %p, i32 4, <vscale x 16 x i1> %interleaved.mask, <vscale x 16 x float> poison) + %deinterleaved.vec = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> %wide.masked.vec) + ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %deinterleaved.vec +} + +define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @foo_ld4_nxv2f64(<vscale x 2 x i1> %mask, ptr %p) { +; CHECK-LABEL: foo_ld4_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ld4d { z0.d - z3.d }, p0/z, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 8 x i1> @llvm.vector.interleave4.nxv8i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask) + %wide.masked.vec = call <vscale x 8 x double> @llvm.masked.load.nxv8f64(ptr %p, i32 8, <vscale x 8 x i1> %interleaved.mask, <vscale x 8 x double> poison) + %deinterleaved.vec = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave4.nxv8f64(<vscale x 8 x double> %wide.masked.vec) + ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %deinterleaved.vec +} + + +define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld4_nxv16i8_mul_use_of_mask(<vscale x 16 x i1> %mask, ptr %p, ptr 
%p2) { +; CHECK-LABEL: foo_ld4_nxv16i8_mul_use_of_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 p2.b, p0.b, p0.b +; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0] +; CHECK-NEXT: zip2 p1.b, p0.b, p0.b +; CHECK-NEXT: zip1 p3.b, p2.b, p2.b +; CHECK-NEXT: zip2 p0.b, p1.b, p1.b +; CHECK-NEXT: zip1 p1.b, p1.b, p1.b +; CHECK-NEXT: zip2 p2.b, p2.b, p2.b +; CHECK-NEXT: // fake_use: $p3 +; CHECK-NEXT: // fake_use: $p2 +; CHECK-NEXT: // fake_use: $p1 +; CHECK-NEXT: // fake_use: $p0 +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask) + %wide.masked.vec = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8(ptr %p, i32 4, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison) + %deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec) + call void (...) 
@llvm.fake.use(<vscale x 64 x i1> %interleaved.mask) + ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec +} + +define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld4_nxv16i8_mask_of_interleaved_ones(ptr %p) { +; CHECK-LABEL: foo_ld4_nxv16i8_mask_of_interleaved_ones: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1)) + %wide.masked.vec = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8(ptr %p, i32 4, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison) + %deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec) + ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec +} + +define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld4_nxv16i8_mask_of_ones(ptr %p) { +; CHECK-LABEL: foo_ld4_nxv16i8_mask_of_ones: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0] +; CHECK-NEXT: ret + %wide.masked.vec = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8(ptr %p, i32 4, <vscale x 64 x i1> splat(i1 1), <vscale x 64 x i8> poison) + %deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec) + ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec +} + + +; Negative tests + +define { <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld2_nxv16i8_mul_use_of_load(<vscale x 16 x i1> %mask, 
ptr %p, ptr %p2) { +; CHECK-LABEL: foo_ld2_nxv16i8_mul_use_of_load: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 p1.b, p0.b, p0.b +; CHECK-NEXT: zip2 p0.b, p0.b, p0.b +; CHECK-NEXT: ld1b { z3.b }, p1/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: uzp1 z0.b, z3.b, z2.b +; CHECK-NEXT: uzp2 z1.b, z3.b, z2.b +; CHECK-NEXT: // fake_use: $z3 +; CHECK-NEXT: // fake_use: $z2 +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask) + %wide.masked.vec = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8(ptr %p, i32 4, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison) + %deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %wide.masked.vec) + call void (...) @llvm.fake.use(<vscale x 32 x i8> %wide.masked.vec) + ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec +} + +; Mask must be an interleave of identical masks. 
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld2_nxv16i8_bad_mask(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask2, ptr %p, ptr %p2) { +; CHECK-LABEL: foo_ld2_nxv16i8_bad_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 p2.b, p0.b, p1.b +; CHECK-NEXT: zip2 p0.b, p0.b, p1.b +; CHECK-NEXT: ld1b { z2.b }, p2/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: uzp1 z0.b, z2.b, z1.b +; CHECK-NEXT: uzp2 z1.b, z2.b, z1.b +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask2) + %wide.masked.vec = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8(ptr %p, i32 4, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison) + %deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %wide.masked.vec) + ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec +} + +; Number of parts in mask interleave must match deinterleave. 
+define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld4_nxv16i8_bad_mask2(<vscale x 32 x i1> %mask, ptr %p, ptr %p2) { +; CHECK-LABEL: foo_ld4_nxv16i8_bad_mask2: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 p2.b, p1.b, p1.b +; CHECK-NEXT: zip2 p1.b, p1.b, p1.b +; CHECK-NEXT: zip2 p3.b, p0.b, p0.b +; CHECK-NEXT: ld1b { z3.b }, p2/z, [x0, #2, mul vl] +; CHECK-NEXT: zip1 p0.b, p0.b, p0.b +; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, #3, mul vl] +; CHECK-NEXT: ld1b { z0.b }, p3/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: uzp1 z4.b, z3.b, z2.b +; CHECK-NEXT: uzp2 z3.b, z3.b, z2.b +; CHECK-NEXT: uzp1 z5.b, z1.b, z0.b +; CHECK-NEXT: uzp2 z6.b, z1.b, z0.b +; CHECK-NEXT: uzp1 z0.b, z5.b, z4.b +; CHECK-NEXT: uzp1 z1.b, z6.b, z3.b +; CHECK-NEXT: uzp2 z2.b, z5.b, z4.b +; CHECK-NEXT: uzp2 z3.b, z6.b, z3.b +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> %mask, <vscale x 32 x i1> %mask) + %wide.masked.vec = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8(ptr %p, i32 4, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison) + %deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec) + ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec +} + +; Mask must come from an interleave or a splat. 
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld2_nxv16i8_bad_mask3(<vscale x 32 x i1> %mask, ptr %p, ptr %p2) { +; CHECK-LABEL: foo_ld2_nxv16i8_bad_mask3: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0] +; CHECK-NEXT: uzp1 z0.b, z2.b, z1.b +; CHECK-NEXT: uzp2 z1.b, z2.b, z1.b +; CHECK-NEXT: ret + %wide.masked.vec = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8(ptr %p, i32 4, <vscale x 32 x i1> %mask, <vscale x 32 x i8> poison) + %deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %wide.masked.vec) + ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec +} + +; Each deinterleaved vector must be exactly 128 bits. +define { <vscale x 8 x i8>, <vscale x 8 x i8> } @foo_ld2_nxv8i8(<vscale x 8 x i1> %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_nxv8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 p1.h, p0.h, p0.h +; CHECK-NEXT: zip1 p0.h, p0.h, p0.h +; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: uunpkhi z1.h, z0.b +; CHECK-NEXT: uunpklo z2.h, z0.b +; CHECK-NEXT: uzp1 z0.h, z2.h, z1.h +; CHECK-NEXT: uzp2 z1.h, z2.h, z1.h +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 16 x i1> @llvm.vector.interleave2.nxv16i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask) + %wide.masked.vec = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr %p, i32 1, <vscale x 16 x i1> %interleaved.mask, <vscale x 16 x i8> poison) + %deinterleaved.vec = call { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %wide.masked.vec) + ret { <vscale x 8 x i8>, <vscale x 8 x i8> } %deinterleaved.vec +} + +; Passthru must be poison or zero. 
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld2_nxv16i8_bad_passthru(<vscale x 16 x i1> %mask, ptr %p) { +; CHECK-LABEL: foo_ld2_nxv16i8_bad_passthru: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 p1.b, p0.b, p0.b +; CHECK-NEXT: mov z0.b, #3 // =0x3 +; CHECK-NEXT: zip2 p0.b, p0.b, p0.b +; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: sel z2.b, p1, z2.b, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b +; CHECK-NEXT: uzp1 z0.b, z2.b, z1.b +; CHECK-NEXT: uzp2 z1.b, z2.b, z1.b +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask) + %wide.masked.vec = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8(ptr %p, i32 1, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> splat(i8 3)) + %deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %wide.masked.vec) + ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec +} + +define { <vscale x 8 x i16>, <vscale x 8 x i16> } @foo_deinterleave2_not_load(<vscale x 8 x i16> %vec1, <vscale x 8 x i16> %vec2) { +; CHECK-LABEL: foo_deinterleave2_not_load: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 z2.h, z0.h, z1.h +; CHECK-NEXT: uzp2 z1.h, z0.h, z1.h +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %bad.vec.init = call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> %vec1, i64 0) + %bad.vec = call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16(<vscale x 16 x i16> %bad.vec.init, <vscale x 8 x i16> %vec2, i64 8) + %deinterleaved.vec = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %bad.vec) + ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleaved.vec +} + +define { <vscale x 4 x i16>, <vscale x 4 x i16> } @foo_ld2_nxv8i8_exti16(<vscale x 4 x i1> %mask, ptr %p) { +; CHECK-LABEL: 
foo_ld2_nxv8i8_exti16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 p1.s, p0.s, p0.s +; CHECK-NEXT: zip1 p0.s, p0.s, p0.s +; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z2.s, z0.h +; CHECK-NEXT: uzp1 z0.s, z2.s, z1.s +; CHECK-NEXT: uzp2 z1.s, z2.s, z1.s +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask) + %wide.masked.vec = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %p, i32 1, <vscale x 8 x i1> %interleaved.mask, <vscale x 8 x i8> poison) + %wide.masked.vec.ext = zext <vscale x 8 x i8> %wide.masked.vec to <vscale x 8 x i16> + %deinterleaved.vec = call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %wide.masked.vec.ext) + ret { <vscale x 4 x i16>, <vscale x 4 x i16> } %deinterleaved.vec +} diff --git a/llvm/test/CodeGen/AArch64/scalable_masked_interleaved_stores.ll b/llvm/test/CodeGen/AArch64/scalable_masked_interleaved_stores.ll new file mode 100644 index 000000000000..1657dfad9434 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/scalable_masked_interleaved_stores.ll @@ -0,0 +1,288 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +define void @foo_st2_nxv16i8(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2, ptr %p) { +; CHECK-LABEL: foo_st2_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2b { z0.b, z1.b }, p0, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask) + %interleaved.value = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 
16 x i8> %val1, <vscale x 16 x i8> %val2) + call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> %interleaved.value, ptr %p, i32 1, <vscale x 32 x i1> %interleaved.mask) + ret void +} + +define void @foo_st2_nxv8i16(<vscale x 8 x i1> %mask, <vscale x 8 x i16> %val1, <vscale x 8 x i16> %val2, ptr %p) { +; CHECK-LABEL: foo_st2_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2h { z0.h, z1.h }, p0, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 16 x i1> @llvm.vector.interleave2.nxv16i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask) + %interleaved.value = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %val1, <vscale x 8 x i16> %val2) + call void @llvm.masked.store.nxv16i16.p0(<vscale x 16 x i16> %interleaved.value, ptr %p, i32 1, <vscale x 16 x i1> %interleaved.mask) + ret void +} + +define void @foo_st2_nxv4i32(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %val1, <vscale x 4 x i32> %val2, ptr %p) { +; CHECK-LABEL: foo_st2_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2w { z0.s, z1.s }, p0, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask) + %interleaved.value = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %val1, <vscale x 4 x i32> %val2) + call void @llvm.masked.store.nxv8i32.p0(<vscale x 8 x i32> %interleaved.value, ptr %p, i32 1, <vscale x 8 x i1> %interleaved.mask) + ret void +} + +define void @foo_st2_nxv2i64(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %val1, <vscale x 2 x i64> %val2, ptr %p) { +; CHECK-LABEL: foo_st2_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; 
CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2d { z0.d, z1.d }, p0, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask) + %interleaved.value = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %val1, <vscale x 2 x i64> %val2) + call void @llvm.masked.store.nxv4i64.p0(<vscale x 4 x i64> %interleaved.value, ptr %p, i32 1, <vscale x 4 x i1> %interleaved.mask) + ret void +} + +define void @foo_st4_nxv16i8(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2, <vscale x 16 x i8> %val3, <vscale x 16 x i8> %val4, ptr %p) { +; CHECK-LABEL: foo_st4_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4b { z0.b - z3.b }, p0, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask) + %interleaved.value = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2, <vscale x 16 x i8> %val3, <vscale x 16 x i8> %val4) + call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> %interleaved.value, ptr %p, i32 1, <vscale x 64 x i1> %interleaved.mask) + ret void +} + +define void @foo_st4_nxv8i16(<vscale x 8 x i1> %mask, <vscale x 8 x i16> %val1, <vscale x 8 x i16> %val2, <vscale x 8 x i16> %val3, <vscale x 8 x i16> %val4, ptr %p) { +; CHECK-LABEL: foo_st4_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 
killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4h { z0.h - z3.h }, p0, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask) + %interleaved.value = call <vscale x 32 x i16> @llvm.vector.interleave4.nxv32i16(<vscale x 8 x i16> %val1, <vscale x 8 x i16> %val2, <vscale x 8 x i16> %val3, <vscale x 8 x i16> %val4) + call void @llvm.masked.store.nxv32i16.p0(<vscale x 32 x i16> %interleaved.value, ptr %p, i32 1, <vscale x 32 x i1> %interleaved.mask) + ret void +} + +define void @foo_st4_nxv4i32(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %val1, <vscale x 4 x i32> %val2, <vscale x 4 x i32> %val3, <vscale x 4 x i32> %val4, ptr %p) { +; CHECK-LABEL: foo_st4_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4w { z0.s - z3.s }, p0, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask) + %interleaved.value = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv16i32(<vscale x 4 x i32> %val1, <vscale x 4 x i32> %val2, <vscale x 4 x i32> %val3, <vscale x 4 x i32> %val4) + call void @llvm.masked.store.nxv16i32.p0(<vscale x 16 x i32> %interleaved.value, ptr %p, i32 1, <vscale x 16 x i1> %interleaved.mask) + ret void +} + +define void @foo_st4_nxv2i64(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %val1, 
<vscale x 2 x i64> %val2, <vscale x 2 x i64> %val3, <vscale x 2 x i64> %val4, ptr %p) { +; CHECK-LABEL: foo_st4_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: st4d { z0.d - z3.d }, p0, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 8 x i1> @llvm.vector.interleave4.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask) + %interleaved.value = call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> %val1, <vscale x 2 x i64> %val2, <vscale x 2 x i64> %val3, <vscale x 2 x i64> %val4) + call void @llvm.masked.store.nxv8i64.p0(<vscale x 8 x i64> %interleaved.value, ptr %p, i32 1, <vscale x 8 x i1> %interleaved.mask) + ret void +} + +define void @foo_st2_nxv16i8_mul_use_mask(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2, ptr %p) { +; CHECK-LABEL: foo_st2_nxv16i8_mul_use_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 p1.b, p0.b, p0.b +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: zip2 p2.b, p0.b, p0.b +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2b { z0.b, z1.b }, p0, [x0] +; CHECK-NEXT: // fake_use: $p1 +; CHECK-NEXT: // fake_use: $p2 +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask) + %interleaved.value = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2) + call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> %interleaved.value, ptr %p, i32 1, <vscale x 32 x i1> %interleaved.mask) + call void (...) 
@llvm.fake.use(<vscale x 32 x i1> %interleaved.mask) + ret void +} + +define void @foo_st2_nxv16i8_mask_of_interleaved_ones(<vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2, ptr %p) { +; CHECK-LABEL: foo_st2_nxv16i8_mask_of_interleaved_ones: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: st2b { z0.b, z1.b }, p0, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1)) + %interleaved.value = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2) + call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> %interleaved.value, ptr %p, i32 1, <vscale x 32 x i1> %interleaved.mask) + ret void +} + +define void @foo_st2_nxv16i8_all_false_mask(<vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2, ptr %p) { +; CHECK-LABEL: foo_st2_nxv16i8_all_false_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %interleaved.value = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2) + call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> %interleaved.value, ptr %p, i32 1, <vscale x 32 x i1> splat(i1 0)) + ret void +} + +; DAGCombiner optimises the masked store to a normal store before we even +; reach performMSTORECombine so we never get a chance to convert this to st2b. 
+define void @foo_st2_nxv16i8_all_true_mask(<vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2, ptr %p) { +; CHECK-LABEL: foo_st2_nxv16i8_all_true_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 z2.b, z0.b, z1.b +; CHECK-NEXT: zip1 z0.b, z0.b, z1.b +; CHECK-NEXT: str z2, [x0, #1, mul vl] +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: ret + %interleaved.value = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2) + call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> %interleaved.value, ptr %p, i32 1, <vscale x 32 x i1> splat(i1 1)) + ret void +} + + +; Negative tests + +define void @foo_st2_nxv16i8_mul_use_value(<vscale x 16 x i1> %mask, <vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2, ptr %p) { +; CHECK-LABEL: foo_st2_nxv16i8_mul_use_value: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 z2.b, z0.b, z1.b +; CHECK-NEXT: zip1 z0.b, z0.b, z1.b +; CHECK-NEXT: zip2 p1.b, p0.b, p0.b +; CHECK-NEXT: zip1 p0.b, p0.b, p0.b +; CHECK-NEXT: st1b { z2.b }, p1, [x0, #1, mul vl] +; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: // fake_use: $z0 +; CHECK-NEXT: // fake_use: $z2 +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask) + %interleaved.value = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2) + call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> %interleaved.value, ptr %p, i32 1, <vscale x 32 x i1> %interleaved.mask) + call void (...) @llvm.fake.use(<vscale x 32 x i8> %interleaved.value) + ret void +} + +; Mask must be an interleave of identical masks. 
+define void @foo_st2_nxv16i8_bad_mask(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask2, <vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2, ptr %p) { +; CHECK-LABEL: foo_st2_nxv16i8_bad_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 z2.b, z0.b, z1.b +; CHECK-NEXT: zip1 z0.b, z0.b, z1.b +; CHECK-NEXT: zip2 p2.b, p0.b, p1.b +; CHECK-NEXT: zip1 p0.b, p0.b, p1.b +; CHECK-NEXT: st1b { z2.b }, p2, [x0, #1, mul vl] +; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask2) + %interleaved.value = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2) + call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> %interleaved.value, ptr %p, i32 1, <vscale x 32 x i1> %interleaved.mask) + ret void +} + +; Number of parts in mask interleave must match interleave. +define void @foo_st4_nxv4i32_bad_mask2(<vscale x 8 x i1> %mask, <vscale x 4 x i32> %val1, <vscale x 4 x i32> %val2, <vscale x 4 x i32> %val3, <vscale x 4 x i32> %val4, ptr %p) { +; CHECK-LABEL: foo_st4_nxv4i32_bad_mask2: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 z4.s, z1.s, z3.s +; CHECK-NEXT: zip2 z5.s, z0.s, z2.s +; CHECK-NEXT: zip2 p1.h, p0.h, p0.h +; CHECK-NEXT: zip1 z1.s, z1.s, z3.s +; CHECK-NEXT: zip1 z0.s, z0.s, z2.s +; CHECK-NEXT: zip1 p0.h, p0.h, p0.h +; CHECK-NEXT: punpkhi p2.h, p1.b +; CHECK-NEXT: zip2 z2.s, z5.s, z4.s +; CHECK-NEXT: zip1 z3.s, z5.s, z4.s +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: zip2 z4.s, z0.s, z1.s +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: st1w { z2.s }, p2, [x0, #3, mul vl] +; CHECK-NEXT: punpkhi p2.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: st1w { z3.s }, p1, [x0, #2, mul vl] +; CHECK-NEXT: st1w { z4.s }, p2, [x0, #1, mul vl] +; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 16 x i1> @llvm.vector.interleave2.nxv16i1(<vscale x 8 x 
i1> %mask, <vscale x 8 x i1> %mask) + %interleaved.value = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv16i32(<vscale x 4 x i32> %val1, <vscale x 4 x i32> %val2, <vscale x 4 x i32> %val3, <vscale x 4 x i32> %val4) + call void @llvm.masked.store.nxv16i32.p0(<vscale x 16 x i32> %interleaved.value, ptr %p, i32 1, <vscale x 16 x i1> %interleaved.mask) + ret void +} + +; Mask must come from an interleave or a splat. +define void @foo_st2_nxv16i8_bad_mask3(<vscale x 32 x i1> %mask, <vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2, ptr %p) { +; CHECK-LABEL: foo_st2_nxv16i8_bad_mask3: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 z2.b, z0.b, z1.b +; CHECK-NEXT: zip1 z0.b, z0.b, z1.b +; CHECK-NEXT: st1b { z2.b }, p1, [x0, #1, mul vl] +; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ret + %interleaved.value = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %val1, <vscale x 16 x i8> %val2) + call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> %interleaved.value, ptr %p, i32 1, <vscale x 32 x i1> %mask) + ret void +} + +; Each interleaved vector must be exactly 128 bits. 
+define void @foo_st2_nxv8i8(<vscale x 8 x i1> %mask, <vscale x 8 x i8> %val1, <vscale x 8 x i8> %val2, ptr %p) { +; CHECK-LABEL: foo_st2_nxv8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 z2.h, z0.h, z1.h +; CHECK-NEXT: zip1 z0.h, z0.h, z1.h +; CHECK-NEXT: zip2 p1.h, p0.h, p0.h +; CHECK-NEXT: zip1 p0.h, p0.h, p0.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z2.b +; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 16 x i1> @llvm.vector.interleave2.nxv16i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask) + %interleaved.value = call <vscale x 16 x i8> @llvm.vector.interleave2.nxv16i8(<vscale x 8 x i8> %val1, <vscale x 8 x i8> %val2) + call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %interleaved.value, ptr %p, i32 1, <vscale x 16 x i1> %interleaved.mask) + ret void +} + +define void @foo_st2_nxv8i8_trunc(<vscale x 4 x i1> %mask, <vscale x 4 x i16> %val1, <vscale x 4 x i16> %val2, ptr %p) { +; CHECK-LABEL: foo_st2_nxv8i8_trunc: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 z2.s, z0.s, z1.s +; CHECK-NEXT: zip1 z0.s, z0.s, z1.s +; CHECK-NEXT: zip2 p1.s, p0.s, p0.s +; CHECK-NEXT: zip1 p0.s, p0.s, p0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h +; CHECK-NEXT: st1b { z0.h }, p0, [x0] +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask) + %interleaved.value = call <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16> %val1, <vscale x 4 x i16> %val2) + %trunc.value = trunc <vscale x 8 x i16> %interleaved.value to <vscale x 8 x i8> + call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> %trunc.value, ptr %p, i32 1, <vscale x 8 x i1> %interleaved.mask) + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll b/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll index 85aa6846cd80..0091469edde9 100644 --- 
a/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll +++ b/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll @@ -507,8 +507,8 @@ define i32 @leaf_sign_all_a_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r ; ; PAUTHLR-LABEL: leaf_sign_all_a_key_bti: ; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: bti c ; PAUTHLR-NEXT: .cfi_negate_ra_state_with_pc +; PAUTHLR-NEXT: bti c ; PAUTHLR-NEXT: .Ltmp10: ; PAUTHLR-NEXT: paciasppc ; PAUTHLR-NEXT: adrp x16, .Ltmp10 @@ -521,8 +521,8 @@ define i32 @leaf_sign_all_a_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r define i32 @leaf_sign_all_b_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-return-address"="all" "sign-return-address-key"="b_key" "branch-target-enforcement" { ; COMPAT-LABEL: leaf_sign_all_b_key_bti: ; COMPAT: // %bb.0: +; COMPAT-NEXT: .cfi_b_key_frame ; COMPAT-NEXT: hint #34 -; COMPAT-NEXT: .cfi_b_key_frame ; COMPAT-NEXT: hint #39 ; COMPAT-NEXT: .cfi_negate_ra_state_with_pc ; COMPAT-NEXT: .Ltmp11: @@ -535,8 +535,8 @@ define i32 @leaf_sign_all_b_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r ; ; V83A-LABEL: leaf_sign_all_b_key_bti: ; V83A: // %bb.0: -; V83A-NEXT: hint #34 ; V83A-NEXT: .cfi_b_key_frame +; V83A-NEXT: hint #34 ; V83A-NEXT: hint #39 ; V83A-NEXT: .cfi_negate_ra_state_with_pc ; V83A-NEXT: .Ltmp11: @@ -548,9 +548,9 @@ define i32 @leaf_sign_all_b_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r ; ; PAUTHLR-LABEL: leaf_sign_all_b_key_bti: ; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: bti c ; PAUTHLR-NEXT: .cfi_b_key_frame ; PAUTHLR-NEXT: .cfi_negate_ra_state_with_pc +; PAUTHLR-NEXT: bti c ; PAUTHLR-NEXT: .Ltmp11: ; PAUTHLR-NEXT: pacibsppc ; PAUTHLR-NEXT: adrp x16, .Ltmp11 @@ -563,9 +563,9 @@ define i32 @leaf_sign_all_b_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r define i32 @leaf_sign_all_v83_b_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-return-address"="all" "target-features"="+v8.3a" "sign-return-address-key"="b_key" "branch-target-enforcement" { ; CHECK-LABEL: 
leaf_sign_all_v83_b_key_bti: ; CHECK: // %bb.0: -; CHECK-NEXT: hint #34 -; CHECK-NEXT: .cfi_b_key_frame -; CHECK-NEXT: hint #39 +; CHECK-NEXT: .cfi_b_key_frame +; CHECK-NEXT: hint #34 +; CHECK-NEXT: hint #39 ; CHECK-NEXT: .cfi_negate_ra_state_with_pc ; CHECK-NEXT: .Ltmp12: ; CHECK-NEXT: pacibsp @@ -576,9 +576,9 @@ define i32 @leaf_sign_all_v83_b_key_bti(i32 %x) "branch-protection-pauth-lr" "si ; ; PAUTHLR-LABEL: leaf_sign_all_v83_b_key_bti: ; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: bti c ; PAUTHLR-NEXT: .cfi_b_key_frame ; PAUTHLR-NEXT: .cfi_negate_ra_state_with_pc +; PAUTHLR-NEXT: bti c ; PAUTHLR-NEXT: .Ltmp12: ; PAUTHLR-NEXT: pacibsppc ; PAUTHLR-NEXT: adrp x16, .Ltmp12 diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll index 25a7b87d37d9..a0a14f2ffae3 100644 --- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll +++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll @@ -1,18 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mattr=+sme2 < %s | FileCheck %s -; RUN: llc -mattr=+sme2 < %s -aarch64-new-sme-abi | FileCheck %s +; RUN: llc -mattr=+sme2 < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK +; RUN: llc -mattr=+sme2 < %s -aarch64-new-sme-abi | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING target triple = "aarch64" declare i64 @private_za_decl(i64) +declare void @private_za() declare i64 @agnostic_decl(i64) "aarch64_za_state_agnostic" ; No calls. Test that no buffer is allocated. 
define i64 @agnostic_caller_no_callees(ptr %ptr) nounwind "aarch64_za_state_agnostic" { -; CHECK-LABEL: agnostic_caller_no_callees: -; CHECK: // %bb.0: -; CHECK-NEXT: ldr x0, [x0] -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: agnostic_caller_no_callees: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: ldr x0, [x0] +; CHECK-COMMON-NEXT: ret %v = load i64, ptr %ptr ret i64 %v } @@ -51,6 +52,29 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: agnostic_caller_private_za_callee: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: mov x8, x0 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size +; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0 +; CHECK-NEWLOWERING-NEXT: mov x19, sp +; CHECK-NEWLOWERING-NEXT: mov x0, x19 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save +; CHECK-NEWLOWERING-NEXT: mov x0, x8 +; CHECK-NEWLOWERING-NEXT: bl private_za_decl +; CHECK-NEWLOWERING-NEXT: bl private_za_decl +; CHECK-NEWLOWERING-NEXT: mov x8, x0 +; CHECK-NEWLOWERING-NEXT: mov x0, x19 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore +; CHECK-NEWLOWERING-NEXT: mov x0, x8 +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret %res = call i64 @private_za_decl(i64 %v) %res2 = call i64 @private_za_decl(i64 %res) ret i64 %res2 @@ -60,12 +84,12 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state ; ; Should not result in save/restore code. 
define i64 @agnostic_caller_agnostic_callee(i64 %v) nounwind "aarch64_za_state_agnostic" { -; CHECK-LABEL: agnostic_caller_agnostic_callee: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: bl agnostic_decl -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: agnostic_caller_agnostic_callee: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-COMMON-NEXT: bl agnostic_decl +; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ret %res = call i64 @agnostic_decl(i64 %v) ret i64 %res } @@ -74,12 +98,12 @@ define i64 @agnostic_caller_agnostic_callee(i64 %v) nounwind "aarch64_za_state_a ; ; Should not result in lazy-save or save of ZT0 define i64 @shared_caller_agnostic_callee(i64 %v) nounwind "aarch64_inout_za" "aarch64_inout_zt0" { -; CHECK-LABEL: shared_caller_agnostic_callee: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: bl agnostic_decl -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: shared_caller_agnostic_callee: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-COMMON-NEXT: bl agnostic_decl +; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ret %res = call i64 @agnostic_decl(i64 %v) ret i64 %res } @@ -126,6 +150,41 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: streaming_agnostic_caller_nonstreaming_private_za_callee: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x8, x0 +; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: add x29, sp, #64 +; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size +; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0 +; CHECK-NEWLOWERING-NEXT: mov x20, sp +; CHECK-NEWLOWERING-NEXT: mov x0, x20 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save +; CHECK-NEWLOWERING-NEXT: smstop sm +; CHECK-NEWLOWERING-NEXT: mov x0, x8 +; CHECK-NEWLOWERING-NEXT: bl private_za_decl +; CHECK-NEWLOWERING-NEXT: smstart sm +; CHECK-NEWLOWERING-NEXT: smstop sm +; CHECK-NEWLOWERING-NEXT: bl private_za_decl +; CHECK-NEWLOWERING-NEXT: smstart sm +; CHECK-NEWLOWERING-NEXT: mov x8, x0 +; CHECK-NEWLOWERING-NEXT: mov x0, x20 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore +; CHECK-NEWLOWERING-NEXT: mov x0, x8 +; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64 +; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret %res = call i64 @private_za_decl(i64 %v) %res2 = call i64 @private_za_decl(i64 %res) ret i64 %res2 @@ -143,40 +202,39 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee( ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: add x29, sp, #64 ; CHECK-NEXT: stp x20, x19, [sp, #80] 
// 16-byte Folded Spill -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: mrs x20, SVCR ; CHECK-NEXT: bl __arm_sme_state_size ; CHECK-NEXT: sub sp, sp, x0 -; CHECK-NEXT: mov x20, sp -; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: bl __arm_sme_save -; CHECK-NEXT: tbz w19, #0, .LBB5_2 +; CHECK-NEXT: tbz w20, #0, .LBB5_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB5_2: ; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: bl private_za_decl ; CHECK-NEXT: mov x1, x0 -; CHECK-NEXT: tbz w19, #0, .LBB5_4 +; CHECK-NEXT: tbz w20, #0, .LBB5_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB5_4: -; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: bl __arm_sme_restore -; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: bl __arm_sme_save -; CHECK-NEXT: tbz w19, #0, .LBB5_6 +; CHECK-NEXT: tbz w20, #0, .LBB5_6 ; CHECK-NEXT: // %bb.5: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB5_6: ; CHECK-NEXT: mov x0, x1 ; CHECK-NEXT: bl private_za_decl ; CHECK-NEXT: mov x1, x0 -; CHECK-NEXT: tbz w19, #0, .LBB5_8 +; CHECK-NEXT: tbz w20, #0, .LBB5_8 ; CHECK-NEXT: // %bb.7: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB5_8: -; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: bl __arm_sme_restore ; CHECK-NEXT: mov x0, x1 ; CHECK-NEXT: sub sp, x29, #64 @@ -187,6 +245,54 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee( ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: streaming_compatible_agnostic_caller_nonstreaming_private_za_callee: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x8, x0 +; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: add x29, sp, #64 +; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size +; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0 +; CHECK-NEWLOWERING-NEXT: mov x19, sp +; CHECK-NEWLOWERING-NEXT: mrs x20, SVCR +; CHECK-NEWLOWERING-NEXT: mov x0, x19 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save +; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_2 +; CHECK-NEWLOWERING-NEXT: // %bb.1: +; CHECK-NEWLOWERING-NEXT: smstop sm +; CHECK-NEWLOWERING-NEXT: .LBB5_2: +; CHECK-NEWLOWERING-NEXT: mov x0, x8 +; CHECK-NEWLOWERING-NEXT: bl private_za_decl +; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_4 +; CHECK-NEWLOWERING-NEXT: // %bb.3: +; CHECK-NEWLOWERING-NEXT: smstart sm +; CHECK-NEWLOWERING-NEXT: .LBB5_4: +; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_6 +; CHECK-NEWLOWERING-NEXT: // %bb.5: +; CHECK-NEWLOWERING-NEXT: smstop sm +; CHECK-NEWLOWERING-NEXT: .LBB5_6: +; CHECK-NEWLOWERING-NEXT: bl private_za_decl +; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_8 +; CHECK-NEWLOWERING-NEXT: // %bb.7: +; CHECK-NEWLOWERING-NEXT: smstart sm +; CHECK-NEWLOWERING-NEXT: .LBB5_8: +; CHECK-NEWLOWERING-NEXT: mov x8, x0 +; CHECK-NEWLOWERING-NEXT: mov x0, x19 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore +; CHECK-NEWLOWERING-NEXT: mov x0, x8 +; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64 +; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte 
Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret %res = call i64 @private_za_decl(i64 %v) %res2 = call i64 @private_za_decl(i64 %res) ret i64 %res2 @@ -223,9 +329,99 @@ define i64 @test_many_callee_arguments( ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: test_many_callee_arguments: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: mov x8, x0 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size +; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0 +; CHECK-NEWLOWERING-NEXT: mov x19, sp +; CHECK-NEWLOWERING-NEXT: ldp x9, x10, [x29, #32] +; CHECK-NEWLOWERING-NEXT: mov x0, x19 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save +; CHECK-NEWLOWERING-NEXT: stp x9, x10, [sp, #-16]! +; CHECK-NEWLOWERING-NEXT: mov x0, x8 +; CHECK-NEWLOWERING-NEXT: bl many_args_private_za_callee +; CHECK-NEWLOWERING-NEXT: add sp, sp, #16 +; CHECK-NEWLOWERING-NEXT: mov x8, x0 +; CHECK-NEWLOWERING-NEXT: mov x0, x19 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore +; CHECK-NEWLOWERING-NEXT: mov x0, x8 +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9 ) nounwind "aarch64_za_state_agnostic" { %ret = call i64 @many_args_private_za_callee( i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9) ret i64 %ret } + +; FIXME: The new lowering should avoid saves/restores in the probing loop. 
+define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_state_agnostic" "probe-stack"="inline-asm" "stack-probe-size"="65536"{ +; CHECK-LABEL: agnostic_za_buffer_alloc_with_stack_probes: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: bl __arm_sme_state_size +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: sub x19, x8, x0 +; CHECK-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x19 +; CHECK-NEXT: b.le .LBB7_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB7_1 +; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: mov sp, x19 +; CHECK-NEXT: ldr xzr, [sp] +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: bl private_za +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: agnostic_za_buffer_alloc_with_stack_probes: +; CHECK-NEWLOWERING: // %bb.0: +; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEWLOWERING-NEXT: mov x29, sp +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size +; CHECK-NEWLOWERING-NEXT: mov x8, sp +; CHECK-NEWLOWERING-NEXT: sub x19, x8, x0 +; CHECK-NEWLOWERING-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEWLOWERING-NEXT: cmp sp, x19 +; CHECK-NEWLOWERING-NEXT: mov x0, x19 +; CHECK-NEWLOWERING-NEXT: mrs x8, NZCV +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save +; CHECK-NEWLOWERING-NEXT: msr NZCV, x8 +; CHECK-NEWLOWERING-NEXT: b.le .LBB7_3 +; CHECK-NEWLOWERING-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1 +; CHECK-NEWLOWERING-NEXT: mov x0, x19 +; CHECK-NEWLOWERING-NEXT: str xzr, [sp] +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore +; CHECK-NEWLOWERING-NEXT: b .LBB7_1 +; CHECK-NEWLOWERING-NEXT: .LBB7_3: +; CHECK-NEWLOWERING-NEXT: mov sp, x19 +; CHECK-NEWLOWERING-NEXT: ldr xzr, [sp] +; CHECK-NEWLOWERING-NEXT: bl private_za +; CHECK-NEWLOWERING-NEXT: mov x0, x19 +; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore +; CHECK-NEWLOWERING-NEXT: mov sp, x29 +; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEWLOWERING-NEXT: ret + call void @private_za() + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll index 8d6432ced8e1..cf42db7aa65b 100644 --- a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll +++ b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll @@ -42,8 +42,7 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{ ; NOPAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; NOPAIR-NEXT: addvl sp, sp, #-1 ; NOPAIR-NEXT: str z0, [sp] // 16-byte Folded Spill -; NOPAIR-NEXT: bl __arm_sme_state -; NOPAIR-NEXT: mov x19, x0 +; NOPAIR-NEXT: mrs x19, SVCR ; 
NOPAIR-NEXT: tbz w19, #0, .LBB0_2 ; NOPAIR-NEXT: // %bb.1: ; NOPAIR-NEXT: smstop sm @@ -123,8 +122,7 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{ ; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: addvl sp, sp, #-1 ; PAIR-NEXT: str z0, [sp] // 16-byte Folded Spill -; PAIR-NEXT: bl __arm_sme_state -; PAIR-NEXT: mov x19, x0 +; PAIR-NEXT: mrs x19, SVCR ; PAIR-NEXT: tbz w19, #0, .LBB0_2 ; PAIR-NEXT: // %bb.1: ; PAIR-NEXT: smstop sm diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index e1bfdddaba92..05d636158b92 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -250,10 +250,7 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: mov x9, sp ; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 -; CHECK-COMMON-NEXT: stur x9, [x29, #-16] -; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] -; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] -; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] ; CHECK-COMMON-NEXT: sub x8, x29, #16 ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 ; CHECK-COMMON-NEXT: bl normal_callee @@ -292,12 +289,9 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: mov x9, sp ; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 -; CHECK-COMMON-NEXT: stur x9, [x29, #-16] -; CHECK-COMMON-NEXT: sub x9, x29, #16 -; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] -; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] -; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9 +; CHECK-COMMON-NEXT: sub x10, x29, #16 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 ; CHECK-COMMON-NEXT: bl __addtf3 ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 @@ -356,12 +350,9 @@ define 
double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: mov x9, sp ; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 -; CHECK-COMMON-NEXT: stur x9, [x29, #-16] -; CHECK-COMMON-NEXT: sub x9, x29, #16 -; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] -; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] -; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9 +; CHECK-COMMON-NEXT: sub x10, x29, #16 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 ; CHECK-COMMON-NEXT: bl fmod ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 @@ -418,8 +409,7 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: bl __arm_sme_state -; CHECK-COMMON-NEXT: mov x19, x0 +; CHECK-COMMON-NEXT: mrs x19, SVCR ; CHECK-COMMON-NEXT: tbz w19, #0, .LBB12_2 ; CHECK-COMMON-NEXT: // %bb.1: ; CHECK-COMMON-NEXT: smstop sm diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index c57cb8e0873d..a7d51968c515 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -10,62 +10,32 @@ declare float @llvm.cos.f32(float) ; Test lazy-save mechanism for a single callee. define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { -; CHECK-LABEL: test_lazy_save_1_callee: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 -; CHECK-NEXT: bl private_za_callee -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB0_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB0_2: -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: test_lazy_save_1_callee: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 -; CHECK-NEWLOWERING-NEXT: bl private_za_callee -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB0_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB0_2: -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-COMMON-LABEL: test_lazy_save_1_callee: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: sub x10, x29, #16 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 +; CHECK-COMMON-NEXT: bl private_za_callee +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB0_2 +; CHECK-COMMON-NEXT: // %bb.1: +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB0_2: +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret call void @private_za_callee() ret void } @@ -74,21 +44,17 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-LABEL: test_lazy_save_2_callees: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill -; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: msub x8, x20, x20, x8 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: sub x21, x29, #16 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -98,8 +64,7 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-NEXT: bl __arm_tpidr2_restore ; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -110,9 +75,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-NEXT: .LBB1_4: ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret ; ; CHECK-NEWLOWERING-LABEL: test_lazy_save_2_callees: @@ -149,62 +113,32 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; Test a call of an intrinsic that gets expanded to a library call. 
define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inout_za" { -; CHECK-LABEL: test_lazy_save_expanded_intrinsic: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 -; CHECK-NEXT: bl cosf -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB2_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB2_2: -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: test_lazy_save_expanded_intrinsic: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 -; CHECK-NEWLOWERING-NEXT: bl cosf -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB2_2: -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-COMMON-LABEL: test_lazy_save_expanded_intrinsic: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: sub x10, x29, #16 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 +; CHECK-COMMON-NEXT: bl cosf +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB2_2 +; CHECK-COMMON-NEXT: // %bb.1: +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB2_2: +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret %res = call float @llvm.cos.f32(float %a) ret float %res } @@ -221,18 +155,14 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-NEXT: add x29, sp, #64 ; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: mrs x20, SVCR ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stur x9, [x29, #-80] -; CHECK-NEXT: sub x9, x29, #80 -; CHECK-NEXT: sturh wzr, [x29, #-70] -; CHECK-NEXT: stur wzr, [x29, #-68] -; CHECK-NEXT: sturh w8, [x29, #-72] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x10, x29, #80 +; CHECK-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: tbz w20, #0, .LBB3_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm @@ -274,8 +204,7 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 ; 
CHECK-NEWLOWERING-NEXT: mov sp, x9 ; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-80] -; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state -; CHECK-NEWLOWERING-NEXT: mov x20, x0 +; CHECK-NEWLOWERING-NEXT: mrs x20, SVCR ; CHECK-NEWLOWERING-NEXT: sub x8, x29, #80 ; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB3_2 @@ -313,24 +242,20 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za" ; CHECK-LABEL: test_lazy_save_mixed_shared_and_private_callees: ; CHECK: // %bb.0: // %prelude -; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill -; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa w29, 48 +; CHECK-NEXT: .cfi_def_cfa w29, 32 ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -32 -; CHECK-NEXT: .cfi_offset w30, -40 -; CHECK-NEXT: .cfi_offset w29, -48 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: cbz x8, .LBB4_2 ; CHECK-NEXT: // %bb.1: // %save.za @@ -338,11 +263,9 @@ define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za" ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: .LBB4_2: ; CHECK-NEXT: smstart za -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: sub x21, x29, #16 +; CHECK-NEXT: sub 
x20, x29, #16 ; CHECK-NEXT: zero {za} -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -354,8 +277,7 @@ define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za" ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: bl shared_za_callee ; CHECK-NEXT: bl preserves_za_callee -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -367,9 +289,8 @@ define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za" ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: smstop za ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret ; ; CHECK-NEWLOWERING-LABEL: test_lazy_save_mixed_shared_and_private_callees: @@ -428,28 +349,23 @@ define void @test_lazy_save_mixed_shared_and_private_callees() "aarch64_new_za" define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { ; CHECK-LABEL: test_many_back2back_private_za_calls: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill -; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa w29, 48 +; CHECK-NEXT: .cfi_def_cfa w29, 32 ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -32 -; CHECK-NEXT: .cfi_offset w30, -40 -; CHECK-NEXT: .cfi_offset w29, -48 -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: msub x8, x20, x20, x8 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: bl shared_za_callee -; CHECK-NEXT: sub x21, x29, #16 -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -459,8 +375,7 @@ define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { ; CHECK-NEXT: bl __arm_tpidr2_restore ; CHECK-NEXT: .LBB5_2: ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -470,8 +385,7 @@ define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { ; CHECK-NEXT: bl __arm_tpidr2_restore ; CHECK-NEXT: .LBB5_4: ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -481,8 
+395,7 @@ define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { ; CHECK-NEXT: bl __arm_tpidr2_restore ; CHECK-NEXT: .LBB5_6: ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -492,8 +405,7 @@ define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { ; CHECK-NEXT: bl __arm_tpidr2_restore ; CHECK-NEXT: .LBB5_8: ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -503,8 +415,7 @@ define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { ; CHECK-NEXT: bl __arm_tpidr2_restore ; CHECK-NEXT: .LBB5_10: ; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -516,9 +427,8 @@ define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: bl shared_za_callee ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret ; ; CHECK-NEWLOWERING-LABEL: test_many_back2back_private_za_calls: @@ -570,66 +480,34 @@ define void @test_many_back2back_private_za_calls() "aarch64_inout_za" { } define void @test_shared_private_shared() nounwind "aarch64_inout_za" { -; CHECK-LABEL: test_shared_private_shared: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, 
x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: msub x8, x20, x20, x8 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: bl shared_za_callee -; CHECK-NEXT: sub x8, x29, #16 -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEXT: bl private_za_callee -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB6_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB6_2: -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: bl shared_za_callee -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: test_shared_private_shared: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: bl shared_za_callee -; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEWLOWERING-NEXT: bl private_za_callee -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB6_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB6_2: -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: bl shared_za_callee -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-COMMON-LABEL: test_shared_private_shared: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: bl shared_za_callee +; CHECK-COMMON-NEXT: sub x8, x29, #16 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 +; CHECK-COMMON-NEXT: bl private_za_callee +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB6_2 +; CHECK-COMMON-NEXT: // %bb.1: +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB6_2: +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: bl shared_za_callee +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret call void @shared_za_callee() call void @private_za_callee() call void @shared_za_callee() @@ -651,70 +529,36 @@ declare i64 @shared_za_callee_i64(i64) "aarch64_inout_za" declare i64 @private_za_callee_i64(i64) define i64 @test_shared_private_shared_i64(i64 %x) nounwind "aarch64_inout_za" { -; CHECK-LABEL: test_shared_private_shared_i64: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: msub x8, x20, x20, x8 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: bl shared_za_callee_i64 -; CHECK-NEXT: sub x8, x29, #16 -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEXT: bl private_za_callee_i64 -; CHECK-NEXT: mov x1, x0 -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB8_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB8_2: -; CHECK-NEXT: mov x0, x1 -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: bl shared_za_callee_i64 -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: test_shared_private_shared_i64: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: bl shared_za_callee_i64 -; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEWLOWERING-NEXT: bl private_za_callee_i64 -; CHECK-NEWLOWERING-NEXT: mov x1, x0 -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB8_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB8_2: -; CHECK-NEWLOWERING-NEXT: mov x0, x1 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: bl shared_za_callee_i64 -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-COMMON-LABEL: test_shared_private_shared_i64: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: bl shared_za_callee_i64 +; CHECK-COMMON-NEXT: sub x8, x29, #16 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 +; CHECK-COMMON-NEXT: bl private_za_callee_i64 +; CHECK-COMMON-NEXT: mov x1, x0 +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB8_2 +; CHECK-COMMON-NEXT: // %bb.1: +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB8_2: +; CHECK-COMMON-NEXT: mov x0, x1 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: bl shared_za_callee_i64 +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret %a = call i64 @shared_za_callee_i64(i64 %x) %b = call i64 @private_za_callee_i64(i64 %a) %c = call i64 @shared_za_callee_i64(i64 %b) @@ -739,12 +583,9 @@ define i64 @test_many_callee_arguments( ; CHECK-NEXT: msub x8, x9, x9, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: ldp x10, x11, [x29, #32] -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sub x8, x29, #16 -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh w9, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: sub x12, x29, #16 +; CHECK-NEXT: stp x8, x9, [x29, #-16] +; CHECK-NEXT: msr TPIDR2_EL0, x12 ; CHECK-NEXT: stp x10, x11, [sp, #-16]! 
; CHECK-NEXT: bl many_args_private_za_callee ; CHECK-NEXT: add sp, sp, #16 diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll new file mode 100644 index 000000000000..1c341e8daf49 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme -aarch64-new-sme-abi < %s | FileCheck %s + +declare void @private_za_callee() +declare void @shared_za_callee() "aarch64_inout_za" + +define void @test_lazy_save() nounwind "aarch64_inout_za" { +; CHECK-LABEL: test_lazy_save: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x30, x29, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: lsr x15, x9, #4 +; CHECK-NEXT: bl __chkstk +; CHECK-NEXT: sub x9, sp, x15, lsl #4 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: bl private_za_callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB0_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x30, x29, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @private_za_callee() + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll index ab7c661d2718..80827c254778 100644 
--- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll @@ -63,8 +63,7 @@ define void @test2() nounwind "aarch64_pstate_sm_compatible" { ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: tbz w19, #0, .LBB2_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm @@ -95,8 +94,7 @@ define void @test3() nounwind "aarch64_pstate_sm_compatible" { ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: tbnz w19, #0, .LBB3_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstart sm diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll index b4ff8d085ff4..c8915aac5608 100644 --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -15,12 +15,9 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind { ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -73,12 +70,9 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; 
CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll index 39ea180e7ed8..1f0581a142c4 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll @@ -8,26 +8,24 @@ declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible"; define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind { ; CHECK-LABEL: sm_body_sm_compatible_simple: ; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d15, d14, [sp, #-64]! 
// 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: tbnz w0, #0, .LBB0_2 +; CHECK-NEXT: mrs x8, SVCR +; CHECK-NEXT: tbnz w8, #0, .LBB0_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB0_2: -; CHECK-NEXT: tbnz w0, #0, .LBB0_4 +; CHECK-NEXT: tbnz w8, #0, .LBB0_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: fmov s0, wzr ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload ; CHECK-NEXT: ret ret float zeroinitializer } @@ -40,8 +38,7 @@ define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: tbnz w19, #0, .LBB1_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstart sm @@ -69,17 +66,15 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: tbnz w19, #0, .LBB2_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB2_2: // %entry -; CHECK-NEXT: cbz w8, .LBB2_6 +; CHECK-NEXT: cbz w0, .LBB2_6 ; CHECK-NEXT: // %bb.3: // %if.else ; CHECK-NEXT: bl streaming_compatible_callee ; CHECK-NEXT: tbnz w19, #0, .LBB2_5 diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index ff4f36363edc..9088986ee9b7 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -41,8 +41,7 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: tbz w19, #0, .LBB1_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm @@ -77,8 +76,7 @@ define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_c ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: tbnz w19, #0, .LBB2_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstart sm @@ -134,10 +132,7 @@ define <2 x double> 
@streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: add x8, sp, #16 -; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: tbz w19, #0, .LBB4_2 @@ -209,8 +204,7 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale ; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: tbz w19, #0, .LBB5_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm @@ -301,8 +295,7 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x ; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: str p0, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: tbz w19, #0, .LBB6_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm @@ -365,8 +358,7 @@ define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatibl ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: tbnz w19, #0, .LBB7_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstart sm @@ -381,18 +373,16 @@ define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_co ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill ; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: tbz w8, #0, .LBB8_5 +; CHECK-NEXT: mrs x19, SVCR +; CHECK-NEXT: tbz w0, #0, .LBB8_5 ; CHECK-NEXT: // %bb.1: // %if.then -; CHECK-NEXT: tbnz w0, #0, .LBB8_3 +; CHECK-NEXT: tbnz w19, #0, .LBB8_3 ; CHECK-NEXT: // %bb.2: // %if.then ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB8_3: // %if.then -; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: bl streaming_callee ; CHECK-NEXT: tbnz w19, #0, .LBB8_5 ; CHECK-NEXT: // %bb.4: // %if.then @@ -422,8 +412,7 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind { ; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: tbz w19, #0, .LBB9_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm @@ -469,19 +458,14 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr ; CHECK-NEXT: .cfi_offset b14, -88 ; CHECK-NEXT: .cfi_offset b15, -96 ; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov x8, x1 -; CHECK-NEXT: mov x9, x0 ; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: tbz w19, #0, .LBB10_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB10_2: // %entry ; CHECK-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: mov x0, x9 ; CHECK-NEXT: ldp d2, d3, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov x1, x8 ; CHECK-NEXT: bl bar ; CHECK-NEXT: tbz w19, 
#0, .LBB10_4 ; CHECK-NEXT: // %bb.3: // %entry diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changes-unwindinfo.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changes-unwindinfo.ll index 991776f11ae4..7be5e6fe2986 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changes-unwindinfo.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changes-unwindinfo.ll @@ -283,8 +283,7 @@ define aarch64_sve_vector_pcs void @streaming_compatible_caller_conditional_mode ; CHECK: .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d13 @ cfa - 48 * IncomingVG - 48 ; CHECK: .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d14 @ cfa - 56 * IncomingVG - 48 ; CHECK: .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d15 @ cfa - 64 * IncomingVG - 48 -; CHECK: bl __arm_sme_state -; CHECK: mov x19, x0 +; CHECK: mrs x19, SVCR ; CHECK: tbnz w19, #0, .LBB5_2 ; CHECK: smstart sm ; CHECK: .LBB5_2: diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-landingpads.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-landingpads.ll new file mode 100644 index 000000000000..b583479b21e4 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-landingpads.ll @@ -0,0 +1,198 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64 -aarch64-streaming-hazard-size=0 -mattr=+sme,+sve -stop-before=finalize-isel -verify-machineinstrs < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +declare void @"StreamingCleanup::~StreamingCleanup"(ptr %this) nounwind "aarch64_pstate_sm_enabled" +declare void @"StreamingCompatCleanup::~StreamingCompatCleanup"(ptr %this) nounwind "aarch64_pstate_sm_compatible" + +declare void @may_throw() "aarch64_pstate_sm_compatible" + +; This test models the kind of IR clang would emit for the 
following C++: +; +; struct StreamingCleanup { +; ~StreamingCleanup() __arm_streaming; +; }; +; +; void may_throw() __arm_streaming_compatible; +; +; void streaming_with_cleanup() __arm_streaming { +; StreamingCleanup cleanup; +; may_throw(); +; } +; +; This is a streaming function and all callees of this function are streaming[-compatible] +; functions (including the StreamingCleanup destructor). This means call lowering will not +; insert any streaming mode switches. However, if "may_throw" throws an exception, the +; unwinder can re-enter this function (in %unwind_cleanup) to run the "StreamingCleanup" +; destructor. The unwinder will always re-enter functions with streaming-mode disabled, so +; we must ensure streaming-mode is enabled on entry to exception handlers. +define void @streaming_with_cleanup() "aarch64_pstate_sm_enabled" personality ptr @__gxx_personality_v0 { + ; CHECK-LABEL: name: streaming_with_cleanup + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x7ffff800), %bb.2(0x00000800) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: EH_LABEL <mcsymbol > + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @may_throw, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: EH_LABEL <mcsymbol > + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.normal_return: + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.0.cleanup, 0, 0 + ; CHECK-NEXT: $x0 = COPY [[ADDXri]] + ; CHECK-NEXT: BL @"StreamingCleanup::~StreamingCleanup", csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: RET_ReallyLR + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.unwind_cleanup (landing-pad): + ; CHECK-NEXT: liveins: 
$x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: EH_LABEL <mcsymbol > + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64all = COPY killed $x1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64all = COPY killed $x0 + ; CHECK-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ADDXri1:%[0-9]+]]:gpr64sp = ADDXri %stack.0.cleanup, 0, 0 + ; CHECK-NEXT: $x0 = COPY [[ADDXri1]] + ; CHECK-NEXT: BL @"StreamingCleanup::~StreamingCleanup", csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-NEXT: $x0 = COPY [[COPY1]] + ; CHECK-NEXT: BL @_Unwind_Resume, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + %cleanup = alloca i8, align 1 + invoke void @may_throw() + to label %normal_return unwind label %unwind_cleanup + +normal_return: + call void @"StreamingCleanup::~StreamingCleanup"(ptr %cleanup) + ret void + +unwind_cleanup: + %eh_info = landingpad { ptr, i32 } + cleanup + call void @"StreamingCleanup::~StreamingCleanup"(ptr %cleanup) + resume { ptr, i32 } %eh_info +} + +; This test is the same as "streaming_with_cleanup", but now the function and destructor +; are streaming-compatible functions. 
In this case, when we enter the exception handler, +; we must switch to the streaming-mode that "streaming_compatible_with_cleanup" was entered with +; during normal execution (i.e., EntryPStateSM). +define void @streaming_compatible_with_cleanup() "aarch64_pstate_sm_compatible" personality ptr @__gxx_personality_v0 { + ; CHECK-LABEL: name: streaming_compatible_with_cleanup + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x7ffff800), %bb.2(0x00000800) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[EntryPStateSM:%[0-9]+]]:gpr64 = EntryPStateSM + ; CHECK-NEXT: EH_LABEL <mcsymbol > + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @may_throw, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: EH_LABEL <mcsymbol > + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.normal_return: + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.0.cleanup, 0, 0 + ; CHECK-NEXT: $x0 = COPY [[ADDXri]] + ; CHECK-NEXT: BL @"StreamingCompatCleanup::~StreamingCompatCleanup", csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: RET_ReallyLR + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.unwind_cleanup (landing-pad): + ; CHECK-NEXT: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: EH_LABEL <mcsymbol > + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64all = COPY killed $x1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64all = COPY killed $x0 + ; CHECK-NEXT: MSRpstatePseudo 1, 1, 1, [[EntryPStateSM]], csr_aarch64_smstartstop, implicit-def dead $vg, implicit $vg, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ADDXri1:%[0-9]+]]:gpr64sp = ADDXri 
%stack.0.cleanup, 0, 0 + ; CHECK-NEXT: $x0 = COPY [[ADDXri1]] + ; CHECK-NEXT: BL @"StreamingCompatCleanup::~StreamingCompatCleanup", csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 0, 1, [[EntryPStateSM]], csr_aarch64_smstartstop, implicit-def $vg, implicit $vg, implicit-def $sp, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-NEXT: $x0 = COPY [[COPY1]] + ; CHECK-NEXT: BL @_Unwind_Resume, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $vg + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatePseudo 1, 1, 1, [[EntryPStateSM]], csr_aarch64_smstartstop, implicit-def dead $vg, implicit $vg, implicit $vg, implicit-def $vg, implicit-def $fpmr + %cleanup = alloca i8, align 1 + invoke void @may_throw() + to label %normal_return unwind label %unwind_cleanup + +normal_return: + call void @"StreamingCompatCleanup::~StreamingCompatCleanup"(ptr %cleanup) + ret void + +unwind_cleanup: + %eh_info = landingpad { ptr, i32 } + cleanup + call void @"StreamingCompatCleanup::~StreamingCompatCleanup"(ptr %cleanup) + resume { ptr, i32 } %eh_info +} + +; This is the same as "streaming_with_cleanup" but for a locally streaming function. +; The lowering of "unwind_cleanup" is expected to match "streaming_with_cleanup". 
+define void @locally_streaming_with_cleanup() "aarch64_pstate_sm_body" personality ptr @__gxx_personality_v0 { + ; CHECK-LABEL: name: locally_streaming_with_cleanup + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x7ffff800), %bb.2(0x00000800) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-NEXT: EH_LABEL <mcsymbol > + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: BL @may_throw, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: EH_LABEL <mcsymbol > + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.normal_return: + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.0.cleanup, 0, 0 + ; CHECK-NEXT: $x0 = COPY [[ADDXri]] + ; CHECK-NEXT: BL @"StreamingCleanup::~StreamingCleanup", csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-NEXT: RET_ReallyLR + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.unwind_cleanup (landing-pad): + ; CHECK-NEXT: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: EH_LABEL <mcsymbol > + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64all = COPY killed $x1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64all = COPY killed $x0 + ; CHECK-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: [[ADDXri1:%[0-9]+]]:gpr64sp = ADDXri %stack.0.cleanup, 0, 0 + ; 
CHECK-NEXT: $x0 = COPY [[ADDXri1]] + ; CHECK-NEXT: BL @"StreamingCleanup::~StreamingCleanup", csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit $vg, implicit-def $vg, implicit-def $fpmr + ; CHECK-NEXT: $x0 = COPY [[COPY1]] + ; CHECK-NEXT: BL @_Unwind_Resume, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr + %cleanup = alloca i8, align 1 + invoke void @may_throw() + to label %normal_return unwind label %unwind_cleanup + +normal_return: + call void @"StreamingCleanup::~StreamingCleanup"(ptr %cleanup) + ret void + +unwind_cleanup: + %eh_info = landingpad { ptr, i32 } + cleanup + call void @"StreamingCleanup::~StreamingCleanup"(ptr %cleanup) + resume { ptr, i32 } %eh_info +} + +declare i32 @__gxx_personality_v0(...) 
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll index dec8eb0d8a93..c72077bd311b 100644 --- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll +++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll @@ -786,8 +786,7 @@ define void @streaming_compatible_to_streaming() #4 { ; CHECK-NEXT: .cfi_offset b13, -80 ; CHECK-NEXT: .cfi_offset b14, -88 ; CHECK-NEXT: .cfi_offset b15, -96 -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: tbnz w19, #0, .LBB6_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstart sm @@ -842,8 +841,7 @@ define void @streaming_compatible_to_streaming() #4 { ; FP-CHECK-NEXT: .cfi_offset b13, -80 ; FP-CHECK-NEXT: .cfi_offset b14, -88 ; FP-CHECK-NEXT: .cfi_offset b15, -96 -; FP-CHECK-NEXT: bl __arm_sme_state -; FP-CHECK-NEXT: mov x19, x0 +; FP-CHECK-NEXT: mrs x19, SVCR ; FP-CHECK-NEXT: tbnz w19, #0, .LBB6_2 ; FP-CHECK-NEXT: // %bb.1: ; FP-CHECK-NEXT: smstart sm @@ -905,8 +903,7 @@ define void @streaming_compatible_to_non_streaming() #4 { ; CHECK-NEXT: .cfi_offset b13, -80 ; CHECK-NEXT: .cfi_offset b14, -88 ; CHECK-NEXT: .cfi_offset b15, -96 -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: tbz w19, #0, .LBB7_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm @@ -961,8 +958,7 @@ define void @streaming_compatible_to_non_streaming() #4 { ; FP-CHECK-NEXT: .cfi_offset b13, -80 ; FP-CHECK-NEXT: .cfi_offset b14, -88 ; FP-CHECK-NEXT: .cfi_offset b15, -96 -; FP-CHECK-NEXT: bl __arm_sme_state -; FP-CHECK-NEXT: mov x19, x0 +; FP-CHECK-NEXT: mrs x19, SVCR ; FP-CHECK-NEXT: tbz w19, #0, .LBB7_2 ; FP-CHECK-NEXT: // %bb.1: ; FP-CHECK-NEXT: smstop sm @@ -1033,14 +1029,11 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 { ; NO-SVE-CHECK-NEXT: .cfi_offset b13, -80 ; NO-SVE-CHECK-NEXT: .cfi_offset b14, -88 ; NO-SVE-CHECK-NEXT: .cfi_offset b15, -96 -; NO-SVE-CHECK-NEXT: mov w8, w0 -; NO-SVE-CHECK-NEXT: bl 
__arm_sme_state -; NO-SVE-CHECK-NEXT: mov x19, x0 +; NO-SVE-CHECK-NEXT: mrs x19, SVCR ; NO-SVE-CHECK-NEXT: tbnz w19, #0, .LBB8_2 ; NO-SVE-CHECK-NEXT: // %bb.1: ; NO-SVE-CHECK-NEXT: smstart sm ; NO-SVE-CHECK-NEXT: .LBB8_2: -; NO-SVE-CHECK-NEXT: mov w0, w8 ; NO-SVE-CHECK-NEXT: bl streaming_callee_with_arg ; NO-SVE-CHECK-NEXT: tbnz w19, #0, .LBB8_4 ; NO-SVE-CHECK-NEXT: // %bb.3: diff --git a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll index d3d7e953bedf..18ea07e38fe8 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll @@ -8,24 +8,20 @@ declare void @shared_za_call() "aarch64_inout_za" define void @private_za_loop(i32 %n) "aarch64_inout_za" nounwind { ; CHECK-LABEL: private_za_loop: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill -; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: cmp w0, #1 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: b.lt .LBB0_5 ; CHECK-NEXT: // %bb.1: // %loop.preheader ; CHECK-NEXT: mov w19, w0 -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: sub x21, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 ; CHECK-NEXT: b .LBB0_3 ; CHECK-NEXT: .LBB0_2: // %loop ; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 @@ -34,8 +30,7 @@ define void @private_za_loop(i32 %n) "aarch64_inout_za" nounwind { ; CHECK-NEXT: b.eq .LBB0_5 ; CHECK-NEXT: .LBB0_3: // %loop ; 
CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -47,9 +42,8 @@ define void @private_za_loop(i32 %n) "aarch64_inout_za" nounwind { ; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_5: // %exit ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret ; ; CHECK-NEWLOWERING-LABEL: private_za_loop: @@ -106,25 +100,21 @@ exit: define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind { ; CHECK-LABEL: private_za_loop_active_entry_and_exit: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill -; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: mov w19, w0 -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: bl shared_za_call ; CHECK-NEXT: cmp w19, #1 ; CHECK-NEXT: b.lt .LBB1_5 ; CHECK-NEXT: // %bb.1: // %loop.preheader -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: sub x21, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 ; CHECK-NEXT: b .LBB1_3 ; CHECK-NEXT: .LBB1_2: // %loop ; CHECK-NEXT: // in Loop: Header=BB1_3 Depth=1 @@ -133,8 +123,7 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no ; CHECK-NEXT: b.eq .LBB1_5 ; CHECK-NEXT: .LBB1_3: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -146,9 +135,8 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no ; CHECK-NEXT: b .LBB1_2 ; CHECK-NEXT: .LBB1_5: // %exit ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: b shared_za_call ; ; CHECK-NEWLOWERING-LABEL: private_za_loop_active_entry_and_exit: @@ -251,17 +239,13 @@ define void @cond_private_za_call(i1 %cond) "aarch64_inout_za" nounwind { ; CHECK-NEXT: sub 
sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: tbz w0, #0, .LBB3_4 ; CHECK-NEXT: // %bb.1: // %private_za_call -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -317,20 +301,17 @@ exit: define void @mixed_shared_private_za_loop(ptr %cond) "aarch64_inout_za" nounwind { ; CHECK-LABEL: mixed_shared_private_za_loop: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill -; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: msub x8, x20, x20, x8 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: sub x21, x29, #16 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: b .LBB4_2 ; CHECK-NEXT: .LBB4_1: // %loop ; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1 @@ -340,8 +321,7 @@ define void @mixed_shared_private_za_loop(ptr %cond) "aarch64_inout_za" nounwind ; CHECK-NEXT: .LBB4_2: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: bl shared_za_call -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -354,9 +334,8 @@ define void @mixed_shared_private_za_loop(ptr %cond) "aarch64_inout_za" nounwind ; CHECK-NEXT: .LBB4_4: // %exit ; CHECK-NEXT: bl shared_za_call ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret ; ; CHECK-NEWLOWERING-LABEL: mixed_shared_private_za_loop: @@ -425,18 +404,14 @@ define void @cond_clobber_followed_by_clobber(i1 %cond) "aarch64_inout_za" nounw ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: mov w19, w0 -; CHECK-NEXT: msub x8, x8, x8, x9 -; 
CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: bl shared_za_call ; CHECK-NEXT: tbz w19, #0, .LBB5_4 ; CHECK-NEXT: // %bb.1: // %cond_clobber -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -447,10 +422,8 @@ define void @cond_clobber_followed_by_clobber(i1 %cond) "aarch64_inout_za" nounw ; CHECK-NEXT: .LBB5_3: // %cond_clobber ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: .LBB5_4: // %exit -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -511,67 +484,34 @@ exit: } define void @conditionally_use_za(i1 %cond) "aarch64_inout_za" nounwind { -; CHECK-LABEL: conditionally_use_za: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: tbz w0, #0, .LBB6_4 -; CHECK-NEXT: // %bb.1: // %use_za -; CHECK-NEXT: bl shared_za_call -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 -; CHECK-NEXT: bl private_za_call -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB6_3 -; CHECK-NEXT: // %bb.2: // %use_za -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB6_3: // %use_za -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: .LBB6_4: // %exit -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: conditionally_use_za: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB6_4 -; CHECK-NEWLOWERING-NEXT: // %bb.1: // %use_za -; CHECK-NEWLOWERING-NEXT: bl shared_za_call -; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEWLOWERING-NEXT: bl private_za_call -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB6_3 -; CHECK-NEWLOWERING-NEXT: // %bb.2: // %use_za -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB6_3: // %use_za -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: .LBB6_4: // %exit -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-COMMON-LABEL: conditionally_use_za: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: tbz w0, #0, .LBB6_4 +; CHECK-COMMON-NEXT: // %bb.1: // %use_za +; CHECK-COMMON-NEXT: bl shared_za_call +; CHECK-COMMON-NEXT: sub x8, x29, #16 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 +; CHECK-COMMON-NEXT: bl private_za_call +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB6_3 +; CHECK-COMMON-NEXT: // %bb.2: // %use_za +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB6_3: // %use_za +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: .LBB6_4: // %exit +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret br i1 %cond, label %use_za, label %exit use_za: @@ -585,73 +525,37 @@ exit: define void @diamond_mixed_za_merge_shared(i1 %cond) "aarch64_inout_za" nounwind { -; CHECK-LABEL: diamond_mixed_za_merge_shared: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: tbz w0, #0, .LBB7_2 -; CHECK-NEXT: // %bb.1: // %then -; CHECK-NEXT: bl shared_za_call -; CHECK-NEXT: b .LBB7_5 -; CHECK-NEXT: .LBB7_2: // %else -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 -; CHECK-NEXT: bl private_za_call -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB7_4 -; CHECK-NEXT: // %bb.3: // %else -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB7_4: // %else -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: .LBB7_5: // %merge_shared -; CHECK-NEXT: bl shared_za_call -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: diamond_mixed_za_merge_shared: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: tbz w0, #0, .LBB7_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: // %then -; CHECK-NEWLOWERING-NEXT: bl shared_za_call -; CHECK-NEWLOWERING-NEXT: b .LBB7_5 -; CHECK-NEWLOWERING-NEXT: .LBB7_2: // %else -; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEWLOWERING-NEXT: bl private_za_call -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB7_4 -; CHECK-NEWLOWERING-NEXT: // %bb.3: // %else -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB7_4: // %else -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: .LBB7_5: // %merge_shared -; CHECK-NEWLOWERING-NEXT: bl shared_za_call -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-COMMON-LABEL: diamond_mixed_za_merge_shared: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: tbz w0, #0, .LBB7_2 +; CHECK-COMMON-NEXT: // %bb.1: // %then +; CHECK-COMMON-NEXT: bl shared_za_call +; CHECK-COMMON-NEXT: b .LBB7_5 +; CHECK-COMMON-NEXT: .LBB7_2: // %else +; CHECK-COMMON-NEXT: sub x8, x29, #16 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 +; CHECK-COMMON-NEXT: bl private_za_call +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB7_4 +; CHECK-COMMON-NEXT: // %bb.3: // %else +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB7_4: // %else +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: .LBB7_5: // %merge_shared +; CHECK-COMMON-NEXT: bl shared_za_call +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret entry: br i1 %cond, label %then, label %else @@ -677,20 +581,16 @@ define void @diamond_mixed_za_merge_private(i1 %cond) "aarch64_inout_za" nounwin ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: tbz w0, #0, .LBB8_2 ; CHECK-NEXT: // %bb.1: // %then ; CHECK-NEXT: bl shared_za_call ; CHECK-NEXT: b .LBB8_5 ; CHECK-NEXT: .LBB8_2: // %else -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; 
CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -701,10 +601,8 @@ define void @diamond_mixed_za_merge_private(i1 %cond) "aarch64_inout_za" nounwin ; CHECK-NEXT: .LBB8_4: // %else ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: .LBB8_5: // %merge_private_za -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -777,20 +675,16 @@ define void @critical_edge_mixed_za(i1 %c1, i1 %c2) "aarch64_inout_za" nounwind ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: mov w19, w1 -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: tbz w0, #0, .LBB9_5 ; CHECK-NEXT: // %bb.1: // %shared_path ; CHECK-NEXT: bl shared_za_call ; CHECK-NEXT: tbz w19, #0, .LBB9_8 ; CHECK-NEXT: .LBB9_2: // %exit_private -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -802,10 +696,8 @@ define void @critical_edge_mixed_za(i1 %c1, i1 %c2) "aarch64_inout_za" nounwind ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: b .LBB9_9 ; CHECK-NEXT: .LBB9_5: // %private_path -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ 
-897,117 +789,58 @@ exit_shared: } define void @nested_cond_in_loop(i32 %n, i1 %cond) "aarch64_inout_za" nounwind { -; CHECK-LABEL: nested_cond_in_loop: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill -; CHECK-NEXT: str x23, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: cmp w0, #1 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: b.lt .LBB10_8 -; CHECK-NEXT: // %bb.1: // %loop.preheader -; CHECK-NEXT: mov w19, w1 -; CHECK-NEXT: mov w20, w0 -; CHECK-NEXT: mov w21, wzr -; CHECK-NEXT: rdsvl x22, #1 -; CHECK-NEXT: sub x23, x29, #16 -; CHECK-NEXT: b .LBB10_4 -; CHECK-NEXT: .LBB10_2: // %use_shared -; CHECK-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEXT: bl shared_za_call -; CHECK-NEXT: .LBB10_3: // %latch -; CHECK-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEXT: add w21, w21, #1 -; CHECK-NEXT: cmp w21, w20 -; CHECK-NEXT: b.ge .LBB10_8 -; CHECK-NEXT: .LBB10_4: // %loop -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: tbnz w19, #0, .LBB10_2 -; CHECK-NEXT: // %bb.5: // %use_private -; CHECK-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEXT: sturh w22, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x23 -; CHECK-NEXT: bl private_za_call -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB10_7 -; CHECK-NEXT: // %bb.6: // %use_private -; CHECK-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB10_7: // %use_private -; CHECK-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: b 
.LBB10_3 -; CHECK-NEXT: .LBB10_8: // %exit -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x23, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload -; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: nested_cond_in_loop: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: mov x29, sp -; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16 -; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 -; CHECK-NEWLOWERING-NEXT: mov x9, sp -; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 -; CHECK-NEWLOWERING-NEXT: mov sp, x9 -; CHECK-NEWLOWERING-NEXT: cmp w0, #1 -; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEWLOWERING-NEXT: b.lt .LBB10_8 -; CHECK-NEWLOWERING-NEXT: // %bb.1: // %loop.preheader -; CHECK-NEWLOWERING-NEXT: mov w19, w1 -; CHECK-NEWLOWERING-NEXT: mov w20, w0 -; CHECK-NEWLOWERING-NEXT: mov w21, wzr -; CHECK-NEWLOWERING-NEXT: sub x22, x29, #16 -; CHECK-NEWLOWERING-NEXT: b .LBB10_4 -; CHECK-NEWLOWERING-NEXT: .LBB10_2: // %use_shared -; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEWLOWERING-NEXT: bl shared_za_call -; CHECK-NEWLOWERING-NEXT: .LBB10_3: // %latch -; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEWLOWERING-NEXT: add w21, w21, #1 -; CHECK-NEWLOWERING-NEXT: cmp w21, w20 -; CHECK-NEWLOWERING-NEXT: b.ge .LBB10_8 -; CHECK-NEWLOWERING-NEXT: .LBB10_4: // %loop -; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEWLOWERING-NEXT: tbnz w19, #0, .LBB10_2 -; CHECK-NEWLOWERING-NEXT: // %bb.5: // %use_private -; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, 
x22 -; CHECK-NEWLOWERING-NEXT: bl private_za_call -; CHECK-NEWLOWERING-NEXT: smstart za -; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB10_7 -; CHECK-NEWLOWERING-NEXT: // %bb.6: // %use_private -; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB10_7: // %use_private -; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB10_4 Depth=1 -; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEWLOWERING-NEXT: b .LBB10_3 -; CHECK-NEWLOWERING-NEXT: .LBB10_8: // %exit -; CHECK-NEWLOWERING-NEXT: mov sp, x29 -; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-COMMON-LABEL: nested_cond_in_loop: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-48]! 
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: cmp w0, #1 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: b.lt .LBB10_8 +; CHECK-COMMON-NEXT: // %bb.1: // %loop.preheader +; CHECK-COMMON-NEXT: mov w19, w1 +; CHECK-COMMON-NEXT: mov w20, w0 +; CHECK-COMMON-NEXT: mov w21, wzr +; CHECK-COMMON-NEXT: sub x22, x29, #16 +; CHECK-COMMON-NEXT: b .LBB10_4 +; CHECK-COMMON-NEXT: .LBB10_2: // %use_shared +; CHECK-COMMON-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-COMMON-NEXT: bl shared_za_call +; CHECK-COMMON-NEXT: .LBB10_3: // %latch +; CHECK-COMMON-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-COMMON-NEXT: add w21, w21, #1 +; CHECK-COMMON-NEXT: cmp w21, w20 +; CHECK-COMMON-NEXT: b.ge .LBB10_8 +; CHECK-COMMON-NEXT: .LBB10_4: // %loop +; CHECK-COMMON-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-COMMON-NEXT: tbnz w19, #0, .LBB10_2 +; CHECK-COMMON-NEXT: // %bb.5: // %use_private +; CHECK-COMMON-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x22 +; CHECK-COMMON-NEXT: bl private_za_call +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB10_7 +; CHECK-COMMON-NEXT: // %bb.6: // %use_private +; CHECK-COMMON-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB10_7: // %use_private +; CHECK-COMMON-NEXT: // in Loop: Header=BB10_4 Depth=1 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: b .LBB10_3 +; CHECK-COMMON-NEXT: .LBB10_8: // %exit +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldp x20, x19, 
[sp, #32] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret entry: %cmp = icmp sgt i32 %n, 0 br i1 %cmp, label %loop, label %exit @@ -1036,25 +869,21 @@ exit: define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwind { ; CHECK-LABEL: loop_with_external_entry: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill -; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: mov w19, w1 -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: tbz w0, #0, .LBB11_2 ; CHECK-NEXT: // %bb.1: // %init ; CHECK-NEXT: bl shared_za_call ; CHECK-NEXT: .LBB11_2: // %loop.preheader -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: sub x21, x29, #16 +; CHECK-NEXT: sub x20, x29, #16 ; CHECK-NEXT: b .LBB11_4 ; CHECK-NEXT: .LBB11_3: // %loop ; CHECK-NEXT: // in Loop: Header=BB11_4 Depth=1 @@ -1062,8 +891,7 @@ define void @loop_with_external_entry(i1 %c1, i1 %c2) "aarch64_inout_za" nounwin ; CHECK-NEXT: tbz w19, #0, .LBB11_6 ; CHECK-NEXT: .LBB11_4: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -1075,9 +903,8 @@ define void @loop_with_external_entry(i1 
%c1, i1 %c2) "aarch64_inout_za" nounwin ; CHECK-NEXT: b .LBB11_3 ; CHECK-NEXT: .LBB11_6: // %exit ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret ; ; CHECK-NEWLOWERING-LABEL: loop_with_external_entry: diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll index a9ad6f695cf8..066ee3b04046 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll @@ -22,11 +22,9 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: cbz w0, .LBB1_2 ; CHECK-NEXT: // %bb.1: // %use_b ; CHECK-NEXT: fmov s1, #4.00000000 @@ -34,10 +32,8 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6 ; CHECK-NEXT: b .LBB1_5 ; CHECK-NEXT: .LBB1_2: // %use_c ; CHECK-NEXT: fmov s0, s1 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -103,7 +99,6 @@ exit: ret float %ret } -; FIXME: This is missing stack probes with -aarch64-new-sme-abi. 
define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" { ; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe: ; CHECK: // %bb.0: @@ -115,20 +110,18 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: cmp sp, x9 ; CHECK-NEXT: b.le .LBB2_3 ; CHECK-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1 ; CHECK-NEXT: str xzr, [sp] ; CHECK-NEXT: b .LBB2_1 ; CHECK-NEXT: .LBB2_3: -; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: ldr xzr, [sp] -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: cbz w0, .LBB2_5 ; CHECK-NEXT: // %bb.4: // %use_b ; CHECK-NEXT: fmov s1, #4.00000000 @@ -136,10 +129,8 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float ; CHECK-NEXT: b .LBB2_8 ; CHECK-NEXT: .LBB2_5: // %use_c ; CHECK-NEXT: fmov s0, s1 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -165,26 +156,35 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float ; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1 ; CHECK-NEWLOWERING-NEXT: mov x9, sp ; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9 +; CHECK-NEWLOWERING-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEWLOWERING-NEXT: cmp sp, x9 
+; CHECK-NEWLOWERING-NEXT: b.le .LBB2_3 +; CHECK-NEWLOWERING-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1 +; CHECK-NEWLOWERING-NEXT: str xzr, [sp] +; CHECK-NEWLOWERING-NEXT: b .LBB2_1 +; CHECK-NEWLOWERING-NEXT: .LBB2_3: ; CHECK-NEWLOWERING-NEXT: mov sp, x9 +; CHECK-NEWLOWERING-NEXT: ldr xzr, [sp] ; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16 ; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10 -; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_2 -; CHECK-NEWLOWERING-NEXT: // %bb.1: // %use_b +; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_5 +; CHECK-NEWLOWERING-NEXT: // %bb.4: // %use_b ; CHECK-NEWLOWERING-NEXT: fmov s1, #4.00000000 ; CHECK-NEWLOWERING-NEXT: fadd s0, s0, s1 -; CHECK-NEWLOWERING-NEXT: b .LBB2_3 -; CHECK-NEWLOWERING-NEXT: .LBB2_2: // %use_c +; CHECK-NEWLOWERING-NEXT: b .LBB2_6 +; CHECK-NEWLOWERING-NEXT: .LBB2_5: // %use_c ; CHECK-NEWLOWERING-NEXT: fmov s0, s1 ; CHECK-NEWLOWERING-NEXT: bl cosf -; CHECK-NEWLOWERING-NEXT: .LBB2_3: // %exit +; CHECK-NEWLOWERING-NEXT: .LBB2_6: // %exit ; CHECK-NEWLOWERING-NEXT: smstart za ; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16 -; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_5 -; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit +; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_8 +; CHECK-NEWLOWERING-NEXT: // %bb.7: // %exit ; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore -; CHECK-NEWLOWERING-NEXT: .LBB2_5: // %exit +; CHECK-NEWLOWERING-NEXT: .LBB2_8: // %exit ; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEWLOWERING-NEXT: mov sp, x29 ; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index 57c1ced8ab12..49eb368662b5 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -40,13 +40,10 @@ define void @za_zt0_shared_caller_no_state_callee(ptr %callee) "aarch64_inout_za ; CHECK-NEXT: 
mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: sub x19, x29, #80 -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: blr x0 ; CHECK-NEXT: smstart za diff --git a/llvm/test/CodeGen/AArch64/sms-order-physreg-deps.mir b/llvm/test/CodeGen/AArch64/sms-order-physreg-deps.mir index 4d8067e16b96..61e3c73a3ee5 100644 --- a/llvm/test/CodeGen/AArch64/sms-order-physreg-deps.mir +++ b/llvm/test/CodeGen/AArch64/sms-order-physreg-deps.mir @@ -1,4 +1,4 @@ -# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -mcpu=a64fx -aarch64-enable-pipeliner -pipeliner-max-mii=100 -pipeliner-enable-copytophi=0 -debug-only=pipeliner -run-pass=pipeliner -treat-scalable-fixed-error-as-warning 2>&1 | FileCheck %s +# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -mcpu=a64fx -aarch64-enable-pipeliner -pipeliner-max-mii=100 -pipeliner-enable-copytophi=0 -debug-only=pipeliner -run-pass=pipeliner 2>&1 | FileCheck %s # REQUIRES: asserts diff --git a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll index 6c248048e682..a23854759d68 100644 --- a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll +++ b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll @@ -2,7 +2,7 @@ ; We should have both spill and reload for %arg. 
-; CHECK: remark: <unknown>:0:0: 2 spills 2.000000e+00 total spills cost 3 reloads 3.000000e+00 total reloads cost 1 virtual registers copies 1.000000e+00 total copies cost generated in function +; CHECK: remark: <unknown>:0:0: 2 spills 2.000000e+00 total spills cost 3 reloads 3.000000e+00 total reloads cost generated in function define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x 2 x i1> %arg) "aarch64_pstate_sm_compatible" nounwind #0 { %res = call <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1> %arg) %and = and <vscale x 2 x i1> %res, %arg diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll index c878d888b5f0..5f52280935c7 100644 --- a/llvm/test/CodeGen/AArch64/stack-hazard.ll +++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll @@ -2855,12 +2855,9 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK0-NEXT: mov w20, w0 ; CHECK0-NEXT: msub x9, x8, x8, x9 ; CHECK0-NEXT: mov sp, x9 -; CHECK0-NEXT: stur x9, [x29, #-80] -; CHECK0-NEXT: sub x9, x29, #80 -; CHECK0-NEXT: sturh wzr, [x29, #-70] -; CHECK0-NEXT: stur wzr, [x29, #-68] -; CHECK0-NEXT: sturh w8, [x29, #-72] -; CHECK0-NEXT: msr TPIDR2_EL0, x9 +; CHECK0-NEXT: sub x10, x29, #80 +; CHECK0-NEXT: stp x9, x8, [x29, #-80] +; CHECK0-NEXT: msr TPIDR2_EL0, x10 ; CHECK0-NEXT: smstop sm ; CHECK0-NEXT: bl other ; CHECK0-NEXT: smstart sm @@ -2930,12 +2927,9 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK64-NEXT: msub x9, x8, x8, x9 ; CHECK64-NEXT: mov x19, sp ; CHECK64-NEXT: mov sp, x9 -; CHECK64-NEXT: str x9, [x19] -; CHECK64-NEXT: add x9, x19, #0 -; CHECK64-NEXT: strh wzr, [x19, #10] -; CHECK64-NEXT: str wzr, [x19, #12] -; CHECK64-NEXT: strh w8, [x19, #8] -; CHECK64-NEXT: msr TPIDR2_EL0, x9 +; CHECK64-NEXT: add x10, x19, #0 +; CHECK64-NEXT: stp x9, x8, [x19] +; CHECK64-NEXT: msr TPIDR2_EL0, x10 ; CHECK64-NEXT: smstop sm ; CHECK64-NEXT: bl other ; 
CHECK64-NEXT: smstart sm @@ -3011,12 +3005,9 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK1024-NEXT: msub x9, x8, x8, x9 ; CHECK1024-NEXT: mov x19, sp ; CHECK1024-NEXT: mov sp, x9 -; CHECK1024-NEXT: str x9, [x19] -; CHECK1024-NEXT: add x9, x19, #0 -; CHECK1024-NEXT: strh wzr, [x19, #10] -; CHECK1024-NEXT: str wzr, [x19, #12] -; CHECK1024-NEXT: strh w8, [x19, #8] -; CHECK1024-NEXT: msr TPIDR2_EL0, x9 +; CHECK1024-NEXT: add x10, x19, #0 +; CHECK1024-NEXT: stp x9, x8, [x19] +; CHECK1024-NEXT: msr TPIDR2_EL0, x10 ; CHECK1024-NEXT: smstop sm ; CHECK1024-NEXT: bl other ; CHECK1024-NEXT: smstart sm diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll index 7ad95429949a..6021f9fab2cd 100644 --- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll +++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll @@ -169,15 +169,14 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind { ; CHECK-NO-SME-ROUTINES-LABEL: sc_memcpy: ; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry ; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill -; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0 +; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst +; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src +; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NO-SME-ROUTINES-NEXT: bl __arm_sme_state -; CHECK-NO-SME-ROUTINES-NEXT: mov x19, x0 -; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst -; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src +; CHECK-NO-SME-ROUTINES-NEXT: mrs x19, SVCR ; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst] ; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src] ; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB3_2 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll index fb494afa11de..258e399018ba 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll @@ -13,15 +13,15 @@ define void @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %right_ ; CHECK-LABEL: fixed_bitselect_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 -; CHECK-NEXT: mov z1.s, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x2] -; CHECK-NEXT: add z1.s, z0.s, z1.s -; CHECK-NEXT: subr z0.s, z0.s, #0 // =0x0 -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: sub z0.s, z0.s, #1 // =0x1 +; CHECK-NEXT: subr z2.s, z2.s, #0 // =0x0 +; CHECK-NEXT: and z0.d, z0.d, z3.d +; 
CHECK-NEXT: and z1.d, z2.d, z1.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1w { z0.s }, p0, [x3] ; CHECK-NEXT: ret %pre_cond = load <8 x i32>, ptr %pre_cond_ptr diff --git a/llvm/test/CodeGen/AArch64/sve-index-const-step-vector.ll b/llvm/test/CodeGen/AArch64/sve-index-const-step-vector.ll index 433ddbd4a261..cf2ae02c14b1 100644 --- a/llvm/test/CodeGen/AArch64/sve-index-const-step-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-index-const-step-vector.ll @@ -94,8 +94,8 @@ define <4 x i32> @v4i32_neg_immediates() #0 { define <4 x i32> @v4i32_out_range_start() #0 { ; CHECK-LABEL: v4i32_out_range_start: ; CHECK: // %bb.0: -; CHECK-NEXT: index z0.s, #0, #1 -; CHECK-NEXT: add z0.s, z0.s, #16 // =0x10 +; CHECK-NEXT: mov w8, #16 // =0x10 +; CHECK-NEXT: index z0.s, w8, #1 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ret <4 x i32> <i32 16, i32 17, i32 18, i32 19> diff --git a/llvm/test/CodeGen/AArch64/sve-int-imm.ll b/llvm/test/CodeGen/AArch64/sve-int-imm.ll index 47f4f0181dfb..985b7b959770 100644 --- a/llvm/test/CodeGen/AArch64/sve-int-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-imm.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -use-constant-int-for-scalable-splat < %s | FileCheck %s @@ -7,415 +8,530 @@ ; ADD define <vscale x 16 x i8> @add_i8_low(<vscale x 16 x i8> %a) { -; CHECK-LABEL: add_i8_low -; CHECK: add z0.b, z0.b, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: add_i8_low: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.b, z0.b, #30 // =0x1e +; CHECK-NEXT: ret %res = add <vscale x 16 x i8> %a, splat(i8 30) ret <vscale x 16 x i8> %res } define <vscale x 8 x i16> @add_i16_low(<vscale x 8 x i16> %a) { -; CHECK-LABEL: add_i16_low -; CHECK: add z0.h, z0.h, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: add_i16_low: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.h, z0.h, 
#30 // =0x1e +; CHECK-NEXT: ret %res = add <vscale x 8 x i16> %a, splat(i16 30) ret <vscale x 8 x i16> %res } define <vscale x 8 x i16> @add_i16_high(<vscale x 8 x i16> %a) { -; CHECK-LABEL: add_i16_high -; CHECK: add z0.h, z0.h, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: add_i16_high: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.h, z0.h, #1024 // =0x400 +; CHECK-NEXT: ret %res = add <vscale x 8 x i16> %a, splat(i16 1024) ret <vscale x 8 x i16> %res } define <vscale x 4 x i32> @add_i32_low(<vscale x 4 x i32> %a) { -; CHECK-LABEL: add_i32_low -; CHECK: add z0.s, z0.s, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: add_i32_low: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.s, z0.s, #30 // =0x1e +; CHECK-NEXT: ret %res = add <vscale x 4 x i32> %a, splat(i32 30) ret <vscale x 4 x i32> %res } define <vscale x 4 x i32> @add_i32_high(<vscale x 4 x i32> %a) { -; CHECK-LABEL: add_i32_high -; CHECK: add z0.s, z0.s, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: add_i32_high: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.s, z0.s, #1024 // =0x400 +; CHECK-NEXT: ret %res = add <vscale x 4 x i32> %a, splat(i32 1024) ret <vscale x 4 x i32> %res } define <vscale x 2 x i64> @add_i64_low(<vscale x 2 x i64> %a) { -; CHECK-LABEL: add_i64_low -; CHECK: add z0.d, z0.d, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: add_i64_low: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.d, z0.d, #30 // =0x1e +; CHECK-NEXT: ret %res = add <vscale x 2 x i64> %a, splat(i64 30) ret <vscale x 2 x i64> %res } define <vscale x 2 x i64> @add_i64_high(<vscale x 2 x i64> %a) { -; CHECK-LABEL: add_i64_high -; CHECK: add z0.d, z0.d, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: add_i64_high: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.d, z0.d, #1024 // =0x400 +; CHECK-NEXT: ret %res = add <vscale x 2 x i64> %a, splat(i64 1024) ret <vscale x 2 x i64> %res } define <vscale x 16 x i8> @add_i8_signedness(<vscale x 16 x i8> %a) { -; CHECK-LABEL: add_i8_signedness -; CHECK: add z0.b, z0.b, #255 -; CHECK-NEXT: ret +; CHECK-LABEL: add_i8_signedness: +; CHECK: // 
%bb.0: +; CHECK-NEXT: add z0.b, z0.b, #255 // =0xff +; CHECK-NEXT: ret %res = add <vscale x 16 x i8> %a, splat(i8 255) ret <vscale x 16 x i8> %res } define <vscale x 8 x i16> @add_i16_signedness(<vscale x 8 x i16> %a) { -; CHECK-LABEL: add_i16_signedness -; CHECK: add z0.h, z0.h, #65280 -; CHECK-NEXT: ret +; CHECK-LABEL: add_i16_signedness: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.h, z0.h, #65280 // =0xff00 +; CHECK-NEXT: ret %res = add <vscale x 8 x i16> %a, splat(i16 65280) ret <vscale x 8 x i16> %res } ; SUBR define <vscale x 16 x i8> @subr_i8_low(<vscale x 16 x i8> %a) { -; CHECK-LABEL: subr_i8_low -; CHECK: subr z0.b, z0.b, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: subr_i8_low: +; CHECK: // %bb.0: +; CHECK-NEXT: subr z0.b, z0.b, #30 // =0x1e +; CHECK-NEXT: ret %res = sub <vscale x 16 x i8> splat(i8 30), %a ret <vscale x 16 x i8> %res } define <vscale x 8 x i16> @subr_i16_low(<vscale x 8 x i16> %a) { -; CHECK-LABEL: subr_i16_low -; CHECK: subr z0.h, z0.h, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: subr_i16_low: +; CHECK: // %bb.0: +; CHECK-NEXT: subr z0.h, z0.h, #30 // =0x1e +; CHECK-NEXT: ret %res = sub <vscale x 8 x i16> splat(i16 30), %a ret <vscale x 8 x i16> %res } define <vscale x 8 x i16> @subr_i16_high(<vscale x 8 x i16> %a) { -; CHECK-LABEL: subr_i16_high -; CHECK: subr z0.h, z0.h, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: subr_i16_high: +; CHECK: // %bb.0: +; CHECK-NEXT: subr z0.h, z0.h, #1024 // =0x400 +; CHECK-NEXT: ret %res = sub <vscale x 8 x i16> splat(i16 1024), %a ret <vscale x 8 x i16> %res } define <vscale x 4 x i32> @subr_i32_low(<vscale x 4 x i32> %a) { -; CHECK-LABEL: subr_i32_low -; CHECK: subr z0.s, z0.s, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: subr_i32_low: +; CHECK: // %bb.0: +; CHECK-NEXT: subr z0.s, z0.s, #30 // =0x1e +; CHECK-NEXT: ret %res = sub <vscale x 4 x i32> splat(i32 30), %a ret <vscale x 4 x i32> %res } define <vscale x 4 x i32> @subr_i32_high(<vscale x 4 x i32> %a) { -; CHECK-LABEL: subr_i32_high -; CHECK: subr z0.s, z0.s, #1024 
-; CHECK-NEXT: ret +; CHECK-LABEL: subr_i32_high: +; CHECK: // %bb.0: +; CHECK-NEXT: subr z0.s, z0.s, #1024 // =0x400 +; CHECK-NEXT: ret %res = sub <vscale x 4 x i32> splat(i32 1024), %a ret <vscale x 4 x i32> %res } define <vscale x 2 x i64> @subr_i64_low(<vscale x 2 x i64> %a) { -; CHECK-LABEL: subr_i64_low -; CHECK: subr z0.d, z0.d, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: subr_i64_low: +; CHECK: // %bb.0: +; CHECK-NEXT: subr z0.d, z0.d, #30 // =0x1e +; CHECK-NEXT: ret %res = sub <vscale x 2 x i64> splat(i64 30), %a ret <vscale x 2 x i64> %res } define <vscale x 2 x i64> @subr_i64_high(<vscale x 2 x i64> %a) { -; CHECK-LABEL: subr_i64_high -; CHECK: subr z0.d, z0.d, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: subr_i64_high: +; CHECK: // %bb.0: +; CHECK-NEXT: subr z0.d, z0.d, #1024 // =0x400 +; CHECK-NEXT: ret %res = sub <vscale x 2 x i64> splat(i64 1024), %a ret <vscale x 2 x i64> %res } ; SUB define <vscale x 16 x i8> @sub_i8_low(<vscale x 16 x i8> %a) { -; CHECK-LABEL: sub_i8_low -; CHECK: sub z0.b, z0.b, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: sub_i8_low: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.b, z0.b, #30 // =0x1e +; CHECK-NEXT: ret %res = sub <vscale x 16 x i8> %a, splat(i8 30) ret <vscale x 16 x i8> %res } define <vscale x 8 x i16> @sub_i16_low(<vscale x 8 x i16> %a) { -; CHECK-LABEL: sub_i16_low -; CHECK: sub z0.h, z0.h, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: sub_i16_low: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.h, z0.h, #30 // =0x1e +; CHECK-NEXT: ret %res = sub <vscale x 8 x i16> %a, splat(i16 30) ret <vscale x 8 x i16> %res } define <vscale x 8 x i16> @sub_i16_high(<vscale x 8 x i16> %a) { -; CHECK-LABEL: sub_i16_high -; CHECK: sub z0.h, z0.h, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: sub_i16_high: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.h, z0.h, #1024 // =0x400 +; CHECK-NEXT: ret %res = sub <vscale x 8 x i16> %a, splat(i16 1024) ret <vscale x 8 x i16> %res } define <vscale x 4 x i32> @sub_i32_low(<vscale x 4 x i32> %a) { -; CHECK-LABEL: sub_i32_low -; 
CHECK: sub z0.s, z0.s, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: sub_i32_low: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.s, z0.s, #30 // =0x1e +; CHECK-NEXT: ret %res = sub <vscale x 4 x i32> %a, splat(i32 30) ret <vscale x 4 x i32> %res } define <vscale x 4 x i32> @sub_i32_high(<vscale x 4 x i32> %a) { -; CHECK-LABEL: sub_i32_high -; CHECK: sub z0.s, z0.s, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: sub_i32_high: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.s, z0.s, #1024 // =0x400 +; CHECK-NEXT: ret %res = sub <vscale x 4 x i32> %a, splat(i32 1024) ret <vscale x 4 x i32> %res } define <vscale x 2 x i64> @sub_i64_low(<vscale x 2 x i64> %a) { -; CHECK-LABEL: sub_i64_low -; CHECK: sub z0.d, z0.d, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: sub_i64_low: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.d, z0.d, #30 // =0x1e +; CHECK-NEXT: ret %res = sub <vscale x 2 x i64> %a, splat(i64 30) ret <vscale x 2 x i64> %res } define <vscale x 2 x i64> @sub_i64_high(<vscale x 2 x i64> %a) { -; CHECK-LABEL: sub_i64_high -; CHECK: sub z0.d, z0.d, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: sub_i64_high: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.d, z0.d, #1024 // =0x400 +; CHECK-NEXT: ret %res = sub <vscale x 2 x i64> %a, splat(i64 1024) ret <vscale x 2 x i64> %res } +; SUB via -ve add +define <vscale x 16 x i8> @addnve_i8_low(<vscale x 16 x i8> %a) { +; CHECK-LABEL: addnve_i8_low: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.b, z0.b, #226 // =0xe2 +; CHECK-NEXT: ret + %res = add <vscale x 16 x i8> %a, splat(i8 -30) + ret <vscale x 16 x i8> %res +} + +define <vscale x 8 x i16> @addnve_i16_low(<vscale x 8 x i16> %a) { +; CHECK-LABEL: addnve_i16_low: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.h, z0.h, #30 // =0x1e +; CHECK-NEXT: ret + %res = add <vscale x 8 x i16> %a, splat(i16 -30) + ret <vscale x 8 x i16> %res +} + +define <vscale x 8 x i16> @addnve_i16_high(<vscale x 8 x i16> %a) { +; CHECK-LABEL: addnve_i16_high: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.h, z0.h, #64512 // =0xfc00 +; CHECK-NEXT: ret + 
%res = add <vscale x 8 x i16> %a, splat(i16 -1024) + ret <vscale x 8 x i16> %res +} + +define <vscale x 4 x i32> @addnve_i32_low(<vscale x 4 x i32> %a) { +; CHECK-LABEL: addnve_i32_low: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.s, z0.s, #30 // =0x1e +; CHECK-NEXT: ret + %res = add <vscale x 4 x i32> %a, splat(i32 -30) + ret <vscale x 4 x i32> %res +} + +define <vscale x 4 x i32> @addnve_i32_high(<vscale x 4 x i32> %a) { +; CHECK-LABEL: addnve_i32_high: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.s, z0.s, #1024 // =0x400 +; CHECK-NEXT: ret + %res = add <vscale x 4 x i32> %a, splat(i32 -1024) + ret <vscale x 4 x i32> %res +} + +define <vscale x 2 x i64> @addnve_i64_low(<vscale x 2 x i64> %a) { +; CHECK-LABEL: addnve_i64_low: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.d, z0.d, #30 // =0x1e +; CHECK-NEXT: ret + %res = add <vscale x 2 x i64> %a, splat(i64 -30) + ret <vscale x 2 x i64> %res +} + +define <vscale x 2 x i64> @addnve_i64_high(<vscale x 2 x i64> %a) { +; CHECK-LABEL: addnve_i64_high: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.d, z0.d, #1024 // =0x400 +; CHECK-NEXT: ret + %res = add <vscale x 2 x i64> %a, splat(i64 -1024) + ret <vscale x 2 x i64> %res +} + ; SQADD define <vscale x 16 x i8> @sqadd_i8_low(<vscale x 16 x i8> %a) { -; CHECK-LABEL: sqadd_i8_low -; CHECK: sqadd z0.b, z0.b, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: sqadd_i8_low: +; CHECK: // %bb.0: +; CHECK-NEXT: sqadd z0.b, z0.b, #30 // =0x1e +; CHECK-NEXT: ret %res = call <vscale x 16 x i8> @llvm.sadd.sat.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> splat(i8 30)) ret <vscale x 16 x i8> %res } define <vscale x 8 x i16> @sqadd_i16_low(<vscale x 8 x i16> %a) { -; CHECK-LABEL: sqadd_i16_low -; CHECK: sqadd z0.h, z0.h, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: sqadd_i16_low: +; CHECK: // %bb.0: +; CHECK-NEXT: sqadd z0.h, z0.h, #30 // =0x1e +; CHECK-NEXT: ret %res = call <vscale x 8 x i16> @llvm.sadd.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> splat(i16 30)) ret <vscale x 8 x i16> %res } define 
<vscale x 8 x i16> @sqadd_i16_high(<vscale x 8 x i16> %a) { -; CHECK-LABEL: sqadd_i16_high -; CHECK: sqadd z0.h, z0.h, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: sqadd_i16_high: +; CHECK: // %bb.0: +; CHECK-NEXT: sqadd z0.h, z0.h, #1024 // =0x400 +; CHECK-NEXT: ret %res = call <vscale x 8 x i16> @llvm.sadd.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> splat(i16 1024)) ret <vscale x 8 x i16> %res } define <vscale x 4 x i32> @sqadd_i32_low(<vscale x 4 x i32> %a) { -; CHECK-LABEL: sqadd_i32_low -; CHECK: sqadd z0.s, z0.s, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: sqadd_i32_low: +; CHECK: // %bb.0: +; CHECK-NEXT: sqadd z0.s, z0.s, #30 // =0x1e +; CHECK-NEXT: ret %res = call <vscale x 4 x i32> @llvm.sadd.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> splat(i32 30)) ret <vscale x 4 x i32> %res } define <vscale x 4 x i32> @sqadd_i32_high(<vscale x 4 x i32> %a) { -; CHECK-LABEL: sqadd_i32_high -; CHECK: sqadd z0.s, z0.s, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: sqadd_i32_high: +; CHECK: // %bb.0: +; CHECK-NEXT: sqadd z0.s, z0.s, #1024 // =0x400 +; CHECK-NEXT: ret %res = call <vscale x 4 x i32> @llvm.sadd.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> splat(i32 1024)) ret <vscale x 4 x i32> %res } define <vscale x 2 x i64> @sqadd_i64_low(<vscale x 2 x i64> %a) { -; CHECK-LABEL: sqadd_i64_low -; CHECK: sqadd z0.d, z0.d, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: sqadd_i64_low: +; CHECK: // %bb.0: +; CHECK-NEXT: sqadd z0.d, z0.d, #30 // =0x1e +; CHECK-NEXT: ret %res = call <vscale x 2 x i64> @llvm.sadd.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> splat(i64 30)) ret <vscale x 2 x i64> %res } define <vscale x 2 x i64> @sqadd_i64_high(<vscale x 2 x i64> %a) { -; CHECK-LABEL: sqadd_i64_high -; CHECK: sqadd z0.d, z0.d, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: sqadd_i64_high: +; CHECK: // %bb.0: +; CHECK-NEXT: sqadd z0.d, z0.d, #1024 // =0x400 +; CHECK-NEXT: ret %res = call <vscale x 2 x i64> @llvm.sadd.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> 
splat(i64 1024)) ret <vscale x 2 x i64> %res } ; UQADD define <vscale x 16 x i8> @uqadd_i8_low(<vscale x 16 x i8> %a) { -; CHECK-LABEL: uqadd_i8_low -; CHECK: uqadd z0.b, z0.b, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: uqadd_i8_low: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.b, z0.b, #30 // =0x1e +; CHECK-NEXT: ret %res = call <vscale x 16 x i8> @llvm.uadd.sat.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> splat(i8 30)) ret <vscale x 16 x i8> %res } define <vscale x 8 x i16> @uqadd_i16_low(<vscale x 8 x i16> %a) { -; CHECK-LABEL: uqadd_i16_low -; CHECK: uqadd z0.h, z0.h, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: uqadd_i16_low: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.h, z0.h, #30 // =0x1e +; CHECK-NEXT: ret %res = call <vscale x 8 x i16> @llvm.uadd.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> splat(i16 30)) ret <vscale x 8 x i16> %res } define <vscale x 8 x i16> @uqadd_i16_high(<vscale x 8 x i16> %a) { -; CHECK-LABEL: uqadd_i16_high -; CHECK: uqadd z0.h, z0.h, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: uqadd_i16_high: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.h, z0.h, #1024 // =0x400 +; CHECK-NEXT: ret %res = call <vscale x 8 x i16> @llvm.uadd.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> splat(i16 1024)) ret <vscale x 8 x i16> %res } define <vscale x 4 x i32> @uqadd_i32_low(<vscale x 4 x i32> %a) { -; CHECK-LABEL: uqadd_i32_low -; CHECK: uqadd z0.s, z0.s, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: uqadd_i32_low: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.s, z0.s, #30 // =0x1e +; CHECK-NEXT: ret %res = call <vscale x 4 x i32> @llvm.uadd.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> splat(i32 30)) ret <vscale x 4 x i32> %res } define <vscale x 4 x i32> @uqadd_i32_high(<vscale x 4 x i32> %a) { -; CHECK-LABEL: uqadd_i32_high -; CHECK: uqadd z0.s, z0.s, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: uqadd_i32_high: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.s, z0.s, #1024 // =0x400 +; CHECK-NEXT: ret %res = call <vscale x 4 x i32> 
@llvm.uadd.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> splat(i32 1024)) ret <vscale x 4 x i32> %res } define <vscale x 2 x i64> @uqadd_i64_low(<vscale x 2 x i64> %a) { -; CHECK-LABEL: uqadd_i64_low -; CHECK: uqadd z0.d, z0.d, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: uqadd_i64_low: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.d, z0.d, #30 // =0x1e +; CHECK-NEXT: ret %res = call <vscale x 2 x i64> @llvm.uadd.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> splat(i64 30)) ret <vscale x 2 x i64> %res } define <vscale x 2 x i64> @uqadd_i64_high(<vscale x 2 x i64> %a) { -; CHECK-LABEL: uqadd_i64_high -; CHECK: uqadd z0.d, z0.d, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: uqadd_i64_high: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.d, z0.d, #1024 // =0x400 +; CHECK-NEXT: ret %res = call <vscale x 2 x i64> @llvm.uadd.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> splat(i64 1024)) ret <vscale x 2 x i64> %res } ; SQSUB define <vscale x 16 x i8> @sqsub_i8_low(<vscale x 16 x i8> %a) { -; CHECK-LABEL: sqsub_i8_low -; CHECK: sqsub z0.b, z0.b, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: sqsub_i8_low: +; CHECK: // %bb.0: +; CHECK-NEXT: sqsub z0.b, z0.b, #30 // =0x1e +; CHECK-NEXT: ret %res = call <vscale x 16 x i8> @llvm.ssub.sat.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> splat(i8 30)) ret <vscale x 16 x i8> %res } define <vscale x 8 x i16> @sqsub_i16_low(<vscale x 8 x i16> %a) { -; CHECK-LABEL: sqsub_i16_low -; CHECK: sqsub z0.h, z0.h, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: sqsub_i16_low: +; CHECK: // %bb.0: +; CHECK-NEXT: sqsub z0.h, z0.h, #30 // =0x1e +; CHECK-NEXT: ret %res = call <vscale x 8 x i16> @llvm.ssub.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> splat(i16 30)) ret <vscale x 8 x i16> %res } define <vscale x 8 x i16> @sqsub_i16_high(<vscale x 8 x i16> %a) { -; CHECK-LABEL: sqsub_i16_high -; CHECK: sqsub z0.h, z0.h, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: sqsub_i16_high: +; CHECK: // %bb.0: +; CHECK-NEXT: sqsub z0.h, z0.h, #1024 // =0x400 +; 
CHECK-NEXT: ret %res = call <vscale x 8 x i16> @llvm.ssub.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> splat(i16 1024)) ret <vscale x 8 x i16> %res } define <vscale x 4 x i32> @sqsub_i32_low(<vscale x 4 x i32> %a) { -; CHECK-LABEL: sqsub_i32_low -; CHECK: sqsub z0.s, z0.s, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: sqsub_i32_low: +; CHECK: // %bb.0: +; CHECK-NEXT: sqsub z0.s, z0.s, #30 // =0x1e +; CHECK-NEXT: ret %res = call <vscale x 4 x i32> @llvm.ssub.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> splat(i32 30)) ret <vscale x 4 x i32> %res } define <vscale x 4 x i32> @sqsub_i32_high(<vscale x 4 x i32> %a) { -; CHECK-LABEL: sqsub_i32_high -; CHECK: sqsub z0.s, z0.s, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: sqsub_i32_high: +; CHECK: // %bb.0: +; CHECK-NEXT: sqsub z0.s, z0.s, #1024 // =0x400 +; CHECK-NEXT: ret %res = call <vscale x 4 x i32> @llvm.ssub.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> splat(i32 1024)) ret <vscale x 4 x i32> %res } define <vscale x 2 x i64> @sqsub_i64_low(<vscale x 2 x i64> %a) { -; CHECK-LABEL: sqsub_i64_low -; CHECK: sqsub z0.d, z0.d, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: sqsub_i64_low: +; CHECK: // %bb.0: +; CHECK-NEXT: sqsub z0.d, z0.d, #30 // =0x1e +; CHECK-NEXT: ret %res = call <vscale x 2 x i64> @llvm.ssub.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> splat(i64 30)) ret <vscale x 2 x i64> %res } define <vscale x 2 x i64> @sqsub_i64_high(<vscale x 2 x i64> %a) { -; CHECK-LABEL: sqsub_i64_high -; CHECK: sqsub z0.d, z0.d, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: sqsub_i64_high: +; CHECK: // %bb.0: +; CHECK-NEXT: sqsub z0.d, z0.d, #1024 // =0x400 +; CHECK-NEXT: ret %res = call <vscale x 2 x i64> @llvm.ssub.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> splat(i64 1024)) ret <vscale x 2 x i64> %res } ; UQSUB define <vscale x 16 x i8> @uqsub_i8_low(<vscale x 16 x i8> %a) { -; CHECK-LABEL: uqsub_i8_low -; CHECK: uqsub z0.b, z0.b, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: uqsub_i8_low: +; CHECK: // %bb.0: 
+; CHECK-NEXT: uqsub z0.b, z0.b, #30 // =0x1e +; CHECK-NEXT: ret %res = call <vscale x 16 x i8> @llvm.usub.sat.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> splat(i8 30)) ret <vscale x 16 x i8> %res } define <vscale x 8 x i16> @uqsub_i16_low(<vscale x 8 x i16> %a) { -; CHECK-LABEL: uqsub_i16_low -; CHECK: uqsub z0.h, z0.h, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: uqsub_i16_low: +; CHECK: // %bb.0: +; CHECK-NEXT: uqsub z0.h, z0.h, #30 // =0x1e +; CHECK-NEXT: ret %res = call <vscale x 8 x i16> @llvm.usub.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> splat(i16 30)) ret <vscale x 8 x i16> %res } define <vscale x 8 x i16> @uqsub_i16_high(<vscale x 8 x i16> %a) { -; CHECK-LABEL: uqsub_i16_high -; CHECK: uqsub z0.h, z0.h, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: uqsub_i16_high: +; CHECK: // %bb.0: +; CHECK-NEXT: uqsub z0.h, z0.h, #1024 // =0x400 +; CHECK-NEXT: ret %res = call <vscale x 8 x i16> @llvm.usub.sat.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> splat(i16 1024)) ret <vscale x 8 x i16> %res } define <vscale x 4 x i32> @uqsub_i32_low(<vscale x 4 x i32> %a) { -; CHECK-LABEL: uqsub_i32_low -; CHECK: uqsub z0.s, z0.s, #30 -; CHECK-NEXT: ret +; CHECK-LABEL: uqsub_i32_low: +; CHECK: // %bb.0: +; CHECK-NEXT: uqsub z0.s, z0.s, #30 // =0x1e +; CHECK-NEXT: ret %res = call <vscale x 4 x i32> @llvm.usub.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> splat(i32 30)) ret <vscale x 4 x i32> %res } define <vscale x 4 x i32> @uqsub_i32_high(<vscale x 4 x i32> %a) { -; CHECK-LABEL: uqsub_i32_high -; CHECK: uqsub z0.s, z0.s, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: uqsub_i32_high: +; CHECK: // %bb.0: +; CHECK-NEXT: uqsub z0.s, z0.s, #1024 // =0x400 +; CHECK-NEXT: ret %res = call <vscale x 4 x i32> @llvm.usub.sat.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> splat(i32 1024)) ret <vscale x 4 x i32> %res } define <vscale x 2 x i64> @uqsub_i64_low(<vscale x 2 x i64> %a) { -; CHECK-LABEL: uqsub_i64_low -; CHECK: uqsub z0.d, z0.d, #30 -; CHECK-NEXT: ret +; 
CHECK-LABEL: uqsub_i64_low: +; CHECK: // %bb.0: +; CHECK-NEXT: uqsub z0.d, z0.d, #30 // =0x1e +; CHECK-NEXT: ret %res = call <vscale x 2 x i64> @llvm.usub.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> splat(i64 30)) ret <vscale x 2 x i64> %res } define <vscale x 2 x i64> @uqsub_i64_high(<vscale x 2 x i64> %a) { -; CHECK-LABEL: uqsub_i64_high -; CHECK: uqsub z0.d, z0.d, #1024 -; CHECK-NEXT: ret +; CHECK-LABEL: uqsub_i64_high: +; CHECK: // %bb.0: +; CHECK-NEXT: uqsub z0.d, z0.d, #1024 // =0x400 +; CHECK-NEXT: ret %res = call <vscale x 2 x i64> @llvm.usub.sat.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> splat(i64 1024)) ret <vscale x 2 x i64> %res } diff --git a/llvm/test/CodeGen/AArch64/sve-saddv_64.ll b/llvm/test/CodeGen/AArch64/sve-saddv_64.ll new file mode 100644 index 000000000000..f30477805262 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-saddv_64.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +define noundef i64 @svaddv_SVBool_SVInt64_t(<vscale x 16 x i1> %a, <vscale x 2 x i64> %b) { +; CHECK-LABEL: svaddv_SVBool_SVInt64_t: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: uaddv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret +entry: + %0 = tail call { <vscale x 16 x i1>, <vscale x 2 x i64> } asm sideeffect "", "=@3Upa,=w,0,1"(<vscale x 16 x i1> %a, <vscale x 2 x i64> %b) + %asmresult = extractvalue { <vscale x 16 x i1>, <vscale x 2 x i64> } %0, 0 + %asmresult1 = extractvalue { <vscale x 16 x i1>, <vscale x 2 x i64> } %0, 1 + %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %asmresult) + %2 = tail call i64 @llvm.aarch64.sve.saddv.nxv2i64(<vscale x 2 x i1> %1, <vscale x 2 x i64> %asmresult1) + ret i64 %2 +} + +declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>) 
+declare i64 @llvm.aarch64.sve.saddv.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>) diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll index 1f1ca7e3b9ee..2cbb29ebe1a1 100644 --- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll +++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll @@ -556,12 +556,9 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK-NEXT: mov w20, w0 ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stur x9, [x29, #-80] -; CHECK-NEXT: sub x9, x29, #80 -; CHECK-NEXT: sturh wzr, [x29, #-70] -; CHECK-NEXT: stur wzr, [x29, #-68] -; CHECK-NEXT: sturh w8, [x29, #-72] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: sub x10, x29, #80 +; CHECK-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl other ; CHECK-NEXT: smstart sm diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll index d29e43509dfe..71396da00400 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll @@ -14,20 +14,21 @@ target triple = "aarch64" define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %right_ptr) { ; CHECK-LABEL: fixed_bitselect_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.s, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ldp q5, q4, [x1] ; CHECK-NEXT: ldp q6, q7, [x2] -; CHECK-NEXT: add z3.s, z1.s, z0.s -; CHECK-NEXT: subr z1.s, z1.s, #0 // =0x0 -; CHECK-NEXT: add z0.s, z2.s, z0.s +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: sub z1.s, z1.s, #1 // =0x1 +; CHECK-NEXT: sub z0.s, z0.s, #1 // =0x1 ; CHECK-NEXT: subr z2.s, z2.s, #0 // =0x0 -; CHECK-NEXT: and z1.d, z1.d, z4.d 
-; CHECK-NEXT: and z3.d, z3.d, z7.d -; CHECK-NEXT: and z0.d, z0.d, z6.d -; CHECK-NEXT: and z2.d, z2.d, z5.d -; CHECK-NEXT: orr z1.d, z3.d, z1.d -; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: subr z3.s, z3.s, #0 // =0x0 +; CHECK-NEXT: and z2.d, z2.d, z4.d +; CHECK-NEXT: and z3.d, z3.d, z5.d +; CHECK-NEXT: and z4.d, z0.d, z7.d +; CHECK-NEXT: and z0.d, z1.d, z6.d +; CHECK-NEXT: orr z1.d, z4.d, z2.d +; CHECK-NEXT: orr z0.d, z0.d, z3.d ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll index 3a6445dd1d99..d226fc89c338 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll @@ -69,9 +69,9 @@ define void @build_vector_0_dec3_v8i32(ptr %a) { ; CHECK-LABEL: build_vector_0_dec3_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: index z0.s, #0, #-3 -; CHECK-NEXT: mov z1.s, #-12 // =0xfffffffffffffff4 -; CHECK-NEXT: add z1.s, z0.s, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: sub z0.s, z0.s, #12 // =0xc +; CHECK-NEXT: str q0, [x0, #16] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: build_vector_0_dec3_v8i32: @@ -91,11 +91,10 @@ define void @build_vector_minus2_dec32_v4i64(ptr %a) { ; CHECK-LABEL: build_vector_minus2_dec32_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, #-32 // =0xffffffffffffffe0 -; CHECK-NEXT: mov z1.d, #-66 // =0xffffffffffffffbe -; CHECK-NEXT: mov z2.d, #-2 // =0xfffffffffffffffe ; CHECK-NEXT: index z0.d, #0, x8 -; CHECK-NEXT: add z1.d, z0.d, z1.d -; CHECK-NEXT: add z0.d, z0.d, z2.d +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: sub z0.d, z0.d, #2 // =0x2 +; CHECK-NEXT: sub z1.d, z1.d, #66 // =0x42 ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; diff --git 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll index 42b947604b86..1fa4b5f62bde 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll @@ -1466,8 +1466,8 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) { define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) { ; CHECK-LABEL: masked_load_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: fmov s1, wzr +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z2.s, z0.s[1] ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: zip1 z0.h, z0.h, z2.h diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll index 9b3da75be47e..8f4a696a28d6 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll @@ -589,8 +589,8 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) { define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) { ; CHECK-LABEL: masked_store_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: fmov s1, wzr +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z2.s, z0.s[1] ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: zip1 z0.h, z0.h, z2.h diff --git a/llvm/test/CodeGen/AArch64/sve-unaligned-load-store-strict-align.ll b/llvm/test/CodeGen/AArch64/sve-unaligned-load-store-strict-align.ll index 981ccdbf589a..78325a1bcded 100644 --- a/llvm/test/CodeGen/AArch64/sve-unaligned-load-store-strict-align.ll +++ b/llvm/test/CodeGen/AArch64/sve-unaligned-load-store-strict-align.ll @@ -5,7 +5,7 @@ ; REQUIRES: asserts ; FIXME: Support 
TLI.expandUnalignedLoad()/TLI.expandUnalignedStore() for SVE. -; CHECK-FIXME: LLVM ERROR: Invalid size request on a scalable vector. +; CHECK-FIXME: LLVM ERROR: Cannot implicitly convert a scalable size to a fixed-width size in `TypeSize::operator ScalarTy() define void @unaligned_nxv16i1(ptr %ldptr, ptr %stptr) { ; CHECK-LABEL: unaligned_nxv16i1: diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-polynomial-arithmetic-128.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-polynomial-arithmetic-128.ll index f695fd444be7..ee77b52362e3 100644 --- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-polynomial-arithmetic-128.ll +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-polynomial-arithmetic-128.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2-aes < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+ssve-aes -force-streaming < %s | FileCheck %s ; ; PMULLB diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-sm4.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-sm4.ll index fb7dcb3485c6..e1050c1b551c 100644 --- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-sm4.ll +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-sm4.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2-sm4 < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve-sm4 < %s | FileCheck %s ; diff --git a/llvm/test/CodeGen/AArch64/switch-cases-to-branch-and.ll b/llvm/test/CodeGen/AArch64/switch-cases-to-branch-and.ll index e99ba4843c45..775ab3fe110e 100644 --- a/llvm/test/CodeGen/AArch64/switch-cases-to-branch-and.ll +++ b/llvm/test/CodeGen/AArch64/switch-cases-to-branch-and.ll @@ -899,3 +899,90 @@ other.dst: pow2.dst: ret void } + +define void @switch_large_enough_for_clustering(i32 %x, ptr %dst) { +; CHECK-LABEL: switch_large_enough_for_clustering: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: 
cmp w0, #1 +; CHECK-NEXT: b.le LBB12_5 +; CHECK-NEXT: ; %bb.1: ; %entry +; CHECK-NEXT: cmp w0, #7 +; CHECK-NEXT: b.eq LBB12_9 +; CHECK-NEXT: ; %bb.2: ; %entry +; CHECK-NEXT: cmp w0, #4 +; CHECK-NEXT: b.eq LBB12_7 +; CHECK-NEXT: ; %bb.3: ; %entry +; CHECK-NEXT: cmp w0, #2 +; CHECK-NEXT: b.eq LBB12_8 +; CHECK-NEXT: LBB12_4: ; %exit +; CHECK-NEXT: ret +; CHECK-NEXT: LBB12_5: ; %entry +; CHECK-NEXT: cbz w0, LBB12_8 +; CHECK-NEXT: ; %bb.6: ; %entry +; CHECK-NEXT: cmp w0, #1 +; CHECK-NEXT: b.ne LBB12_4 +; CHECK-NEXT: LBB12_7: ; %succ.2 +; CHECK-NEXT: str wzr, [x1] +; CHECK-NEXT: ret +; CHECK-NEXT: LBB12_8: ; %succ.1 +; CHECK-NEXT: strb wzr, [x1] +; CHECK-NEXT: ret +; CHECK-NEXT: LBB12_9: ; %succ.3 +; CHECK-NEXT: strh wzr, [x1] +; CHECK-NEXT: ret +; +; GISEL-LABEL: switch_large_enough_for_clustering: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: cmp w0, #2 +; GISEL-NEXT: b.lt LBB12_5 +; GISEL-NEXT: ; %bb.1: ; %entry +; GISEL-NEXT: cmp w0, #7 +; GISEL-NEXT: b.eq LBB12_9 +; GISEL-NEXT: ; %bb.2: ; %entry +; GISEL-NEXT: cmp w0, #4 +; GISEL-NEXT: b.eq LBB12_7 +; GISEL-NEXT: ; %bb.3: ; %entry +; GISEL-NEXT: cmp w0, #2 +; GISEL-NEXT: b.eq LBB12_8 +; GISEL-NEXT: LBB12_4: ; %exit +; GISEL-NEXT: ret +; GISEL-NEXT: LBB12_5: ; %entry +; GISEL-NEXT: cbz w0, LBB12_8 +; GISEL-NEXT: ; %bb.6: ; %entry +; GISEL-NEXT: cmp w0, #1 +; GISEL-NEXT: b.ne LBB12_4 +; GISEL-NEXT: LBB12_7: ; %succ.2 +; GISEL-NEXT: str wzr, [x1] +; GISEL-NEXT: ret +; GISEL-NEXT: LBB12_8: ; %succ.1 +; GISEL-NEXT: strb wzr, [x1] +; GISEL-NEXT: ret +; GISEL-NEXT: LBB12_9: ; %succ.3 +; GISEL-NEXT: strh wzr, [x1] +; GISEL-NEXT: ret +entry: + switch i32 %x, label %exit [ + i32 0, label %succ.1 + i32 2, label %succ.1 + i32 1, label %succ.2 + i32 4, label %succ.2 + i32 7, label %succ.3 + ] + +succ.1: + store i8 0, ptr %dst + br label %exit + +succ.2: + call void @llvm.memset.p0.i64(ptr %dst, i8 0, i64 4, i1 false) + br label %exit + +succ.3: + call void @llvm.memset.p0.i64(ptr %dst, i8 0, i64 2, i1 false) + br label %exit + 
+exit: + ret void +} + +declare void @llvm.memset.p0.i64(ptr writeonly captures(none), i8, i64, i1 immarg) diff --git a/llvm/test/CodeGen/AArch64/urem-lkk.ll b/llvm/test/CodeGen/AArch64/urem-lkk.ll index 0dd668555582..40016c7e4ce0 100644 --- a/llvm/test/CodeGen/AArch64/urem-lkk.ll +++ b/llvm/test/CodeGen/AArch64/urem-lkk.ll @@ -20,26 +20,15 @@ define i32 @fold_urem_positive_odd(i32 %x) { } define i32 @fold_urem_positive_even(i32 %x) { -; CHECK-SD-LABEL: fold_urem_positive_even: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #16323 // =0x3fc3 -; CHECK-SD-NEXT: mov w9, #1060 // =0x424 -; CHECK-SD-NEXT: movk w8, #63310, lsl #16 -; CHECK-SD-NEXT: umull x8, w0, w8 -; CHECK-SD-NEXT: lsr x8, x8, #42 -; CHECK-SD-NEXT: msub w0, w8, w9, w0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fold_urem_positive_even: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #16323 // =0x3fc3 -; CHECK-GI-NEXT: mov w9, #1060 // =0x424 -; CHECK-GI-NEXT: movk w8, #63310, lsl #16 -; CHECK-GI-NEXT: umull x8, w0, w8 -; CHECK-GI-NEXT: lsr x8, x8, #32 -; CHECK-GI-NEXT: lsr w8, w8, #10 -; CHECK-GI-NEXT: msub w0, w8, w9, w0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fold_urem_positive_even: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16323 // =0x3fc3 +; CHECK-NEXT: mov w9, #1060 // =0x424 +; CHECK-NEXT: movk w8, #63310, lsl #16 +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #42 +; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: ret %1 = urem i32 %x, 1060 ret i32 %1 } diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index d6d323530946..25702ef25510 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -851,15 +851,15 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) { ; CHECK-GI-NEXT: sub sp, sp, #16 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: fmov s0, wzr +; 
CHECK-GI-NEXT: movi d0, #0000000000000000 ; CHECK-GI-NEXT: fmov s2, w4 ; CHECK-GI-NEXT: mov.s v1[1], w1 ; CHECK-GI-NEXT: mov.s v2[1], w5 ; CHECK-GI-NEXT: mov.s v0[1], wzr ; CHECK-GI-NEXT: mov.s v1[2], w2 ; CHECK-GI-NEXT: cmeq.4s v0, v2, v0 -; CHECK-GI-NEXT: mvn.16b v0, v0 ; CHECK-GI-NEXT: mov.s v1[3], w3 +; CHECK-GI-NEXT: mvn.16b v0, v0 ; CHECK-GI-NEXT: cmtst.4s v1, v1, v1 ; CHECK-GI-NEXT: mov.s w8, v1[1] ; CHECK-GI-NEXT: mov.s w9, v1[2] diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 74d1165d99b8..ee04e41d5504 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -2127,15 +2127,15 @@ define i32 @test_udot_v48i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-DOT-LABEL: test_udot_v48i8: ; CHECK-GI-DOT: // %bb.0: // %entry -; CHECK-GI-DOT-NEXT: fmov s0, wzr +; CHECK-GI-DOT-NEXT: movi d0, #0000000000000000 ; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000 ; CHECK-GI-DOT-NEXT: ldr q7, [x0, #32] ; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000 ; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-DOT-NEXT: ldr q17, [x1, #32] ; CHECK-GI-DOT-NEXT: ldp q4, q5, [x0] -; CHECK-GI-DOT-NEXT: mov v0.s[1], wzr ; CHECK-GI-DOT-NEXT: ldp q6, q16, [x1] +; CHECK-GI-DOT-NEXT: mov v0.s[1], wzr ; CHECK-GI-DOT-NEXT: udot v2.4s, v17.16b, v7.16b ; CHECK-GI-DOT-NEXT: udot v1.4s, v6.16b, v4.16b ; CHECK-GI-DOT-NEXT: udot v3.4s, v16.16b, v5.16b @@ -2395,15 +2395,15 @@ define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-DOT-LABEL: test_sdot_v48i8: ; CHECK-GI-DOT: // %bb.0: // %entry -; CHECK-GI-DOT-NEXT: fmov s0, wzr +; CHECK-GI-DOT-NEXT: movi d0, #0000000000000000 ; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000 ; CHECK-GI-DOT-NEXT: ldr q7, [x0, #32] ; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000 ; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000 ; CHECK-GI-DOT-NEXT: ldr q17, [x1, #32] ; CHECK-GI-DOT-NEXT: ldp q4, q5, [x0] -; CHECK-GI-DOT-NEXT: mov v0.s[1], wzr ; 
CHECK-GI-DOT-NEXT: ldp q6, q16, [x1] +; CHECK-GI-DOT-NEXT: mov v0.s[1], wzr ; CHECK-GI-DOT-NEXT: sdot v2.4s, v17.16b, v7.16b ; CHECK-GI-DOT-NEXT: sdot v1.4s, v6.16b, v4.16b ; CHECK-GI-DOT-NEXT: sdot v3.4s, v16.16b, v5.16b @@ -4535,96 +4535,89 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) { ; CHECK-GI-NEXT: ldr d1, [x2] ; CHECK-GI-NEXT: add x10, x0, x9 ; CHECK-GI-NEXT: add x11, x2, x8 -; CHECK-GI-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: usubl v2.8h, v0.8b, v1.8b ; CHECK-GI-NEXT: ldr d1, [x10] -; CHECK-GI-NEXT: ldr d2, [x11] -; CHECK-GI-NEXT: add x10, x10, x9 -; CHECK-GI-NEXT: add x11, x11, x8 -; CHECK-GI-NEXT: usubl v1.8h, v1.8b, v2.8b -; CHECK-GI-NEXT: ldr d3, [x10] -; CHECK-GI-NEXT: ldr d4, [x11] -; CHECK-GI-NEXT: sshll v5.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: add x10, x10, x9 -; CHECK-GI-NEXT: add x11, x11, x8 -; CHECK-GI-NEXT: ldr d2, [x10] +; CHECK-GI-NEXT: ldr d3, [x11] ; CHECK-GI-NEXT: add x10, x10, x9 -; CHECK-GI-NEXT: sshll v7.4s, v1.4h, #0 -; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0 -; CHECK-GI-NEXT: ldr d6, [x11] ; CHECK-GI-NEXT: add x11, x11, x8 -; CHECK-GI-NEXT: usubl v3.8h, v3.8b, v4.8b -; CHECK-GI-NEXT: abs v5.4s, v5.4s -; CHECK-GI-NEXT: abs v0.4s, v0.4s +; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 +; CHECK-GI-NEXT: usubl v3.8h, v1.8b, v3.8b ; CHECK-GI-NEXT: ldr d4, [x10] -; CHECK-GI-NEXT: ldr d16, [x11] -; CHECK-GI-NEXT: abs v7.4s, v7.4s -; CHECK-GI-NEXT: abs v1.4s, v1.4s +; CHECK-GI-NEXT: ldr d5, [x11] ; CHECK-GI-NEXT: add x10, x10, x9 ; CHECK-GI-NEXT: add x11, x11, x8 -; CHECK-GI-NEXT: usubl v2.8h, v2.8b, v6.8b +; CHECK-GI-NEXT: sshll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0 ; CHECK-GI-NEXT: ldr d6, [x10] -; CHECK-GI-NEXT: ldr d17, [x11] +; CHECK-GI-NEXT: ldr d7, [x11] ; CHECK-GI-NEXT: add x10, x10, x9 ; CHECK-GI-NEXT: add x11, x11, x8 -; CHECK-GI-NEXT: usubl v4.8h, v4.8b, v16.8b -; CHECK-GI-NEXT: sshll v16.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, 
#0 -; CHECK-GI-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-GI-NEXT: add v1.4s, v7.4s, v1.4s +; CHECK-GI-NEXT: sshll2 v16.4s, v3.8h, #0 +; CHECK-GI-NEXT: usubl v4.8h, v4.8b, v5.8b ; CHECK-GI-NEXT: ldr d5, [x10] -; CHECK-GI-NEXT: ldr d7, [x11] -; CHECK-GI-NEXT: sshll v18.4s, v2.4h, #0 -; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-GI-NEXT: usubl v6.8h, v6.8b, v17.8b -; CHECK-GI-NEXT: ldr d17, [x11, x8] -; CHECK-GI-NEXT: sshll v19.4s, v4.4h, #0 -; CHECK-GI-NEXT: usubl v5.8h, v5.8b, v7.8b -; CHECK-GI-NEXT: ldr d7, [x10, x9] -; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0 +; CHECK-GI-NEXT: ldr d17, [x11] +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x11, x8 +; CHECK-GI-NEXT: usubl v6.8h, v6.8b, v7.8b +; CHECK-GI-NEXT: ldr d7, [x10] +; CHECK-GI-NEXT: ldr d19, [x11] +; CHECK-GI-NEXT: abs v2.4s, v2.4s +; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 ; CHECK-GI-NEXT: abs v16.4s, v16.4s -; CHECK-GI-NEXT: abs v3.4s, v3.4s +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x11, x8 +; CHECK-GI-NEXT: usubl v5.8h, v5.8b, v17.8b +; CHECK-GI-NEXT: ldr d17, [x10] +; CHECK-GI-NEXT: ldr d20, [x11] +; CHECK-GI-NEXT: usubl v7.8h, v7.8b, v19.8b +; CHECK-GI-NEXT: ldr d19, [x10, x9] +; CHECK-GI-NEXT: ldr d21, [x11, x8] +; CHECK-GI-NEXT: sshll2 v18.4s, v4.8h, #0 +; CHECK-GI-NEXT: saba v2.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: sshll2 v1.4s, v6.8h, #0 +; CHECK-GI-NEXT: usubl v17.8h, v17.8b, v20.8b +; CHECK-GI-NEXT: saba v16.4s, v3.4s, v0.4s +; CHECK-GI-NEXT: sshll2 v3.4s, v5.8h, #0 +; CHECK-GI-NEXT: usubl v19.8h, v19.8b, v21.8b +; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 ; CHECK-GI-NEXT: abs v18.4s, v18.4s -; CHECK-GI-NEXT: abs v2.4s, v2.4s -; CHECK-GI-NEXT: usubl v7.8h, v7.8b, v17.8b -; CHECK-GI-NEXT: sshll v17.4s, v6.4h, #0 -; CHECK-GI-NEXT: sshll2 v6.4s, v6.8h, #0 -; CHECK-GI-NEXT: abs v19.4s, v19.4s -; CHECK-GI-NEXT: abs v4.4s, v4.4s -; CHECK-GI-NEXT: add v3.4s, v16.4s, v3.4s -; CHECK-GI-NEXT: sshll v16.4s, v5.4h, #0 -; CHECK-GI-NEXT: sshll2 v5.4s, v5.8h, #0 -; CHECK-GI-NEXT: add 
v2.4s, v18.4s, v2.4s -; CHECK-GI-NEXT: abs v17.4s, v17.4s +; CHECK-GI-NEXT: sshll2 v20.4s, v7.8h, #0 +; CHECK-GI-NEXT: sshll v6.4s, v6.4h, #0 +; CHECK-GI-NEXT: abs v1.4s, v1.4s +; CHECK-GI-NEXT: sshll2 v21.4s, v17.8h, #0 +; CHECK-GI-NEXT: sshll v5.4s, v5.4h, #0 +; CHECK-GI-NEXT: abs v3.4s, v3.4s +; CHECK-GI-NEXT: sshll2 v22.4s, v19.8h, #0 +; CHECK-GI-NEXT: saba v18.4s, v4.4s, v0.4s +; CHECK-GI-NEXT: sshll v4.4s, v7.4h, #0 +; CHECK-GI-NEXT: abs v7.4s, v20.4s +; CHECK-GI-NEXT: saba v1.4s, v6.4s, v0.4s +; CHECK-GI-NEXT: sshll v6.4s, v17.4h, #0 +; CHECK-GI-NEXT: abs v17.4s, v21.4s +; CHECK-GI-NEXT: saba v3.4s, v5.4s, v0.4s +; CHECK-GI-NEXT: sshll v5.4s, v19.4h, #0 +; CHECK-GI-NEXT: abs v19.4s, v22.4s +; CHECK-GI-NEXT: saba v7.4s, v4.4s, v0.4s +; CHECK-GI-NEXT: saba v17.4s, v6.4s, v0.4s +; CHECK-GI-NEXT: saba v19.4s, v5.4s, v0.4s +; CHECK-GI-NEXT: addv s0, v2.4s +; CHECK-GI-NEXT: addv s2, v16.4s +; CHECK-GI-NEXT: addv s4, v18.4s ; CHECK-GI-NEXT: addv s1, v1.4s -; CHECK-GI-NEXT: abs v6.4s, v6.4s -; CHECK-GI-NEXT: addv s0, v0.4s -; CHECK-GI-NEXT: add v4.4s, v19.4s, v4.4s -; CHECK-GI-NEXT: addv s3, v3.4s -; CHECK-GI-NEXT: sshll v18.4s, v7.4h, #0 -; CHECK-GI-NEXT: sshll2 v7.4s, v7.8h, #0 -; CHECK-GI-NEXT: abs v16.4s, v16.4s -; CHECK-GI-NEXT: abs v5.4s, v5.4s -; CHECK-GI-NEXT: fmov w8, s1 -; CHECK-GI-NEXT: add v6.4s, v17.4s, v6.4s -; CHECK-GI-NEXT: addv s2, v2.4s +; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: addv s4, v4.4s -; CHECK-GI-NEXT: fmov w10, s3 -; CHECK-GI-NEXT: abs v18.4s, v18.4s -; CHECK-GI-NEXT: abs v7.4s, v7.4s -; CHECK-GI-NEXT: add v1.4s, v16.4s, v5.4s +; CHECK-GI-NEXT: addv s0, v3.4s +; CHECK-GI-NEXT: fmov w10, s4 +; CHECK-GI-NEXT: addv s2, v7.4s ; CHECK-GI-NEXT: add w8, w8, w9 -; CHECK-GI-NEXT: addv s3, v6.4s -; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: addv s1, v17.4s ; CHECK-GI-NEXT: add w8, w10, w8 -; CHECK-GI-NEXT: fmov w10, s4 -; CHECK-GI-NEXT: add v0.4s, v18.4s, v7.4s -; CHECK-GI-NEXT: addv 
s1, v1.4s ; CHECK-GI-NEXT: add w8, w9, w8 -; CHECK-GI-NEXT: fmov w9, s3 -; CHECK-GI-NEXT: add w8, w10, w8 -; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: addv s0, v19.4s +; CHECK-GI-NEXT: add w8, w9, w8 +; CHECK-GI-NEXT: fmov w9, s2 ; CHECK-GI-NEXT: add w8, w9, w8 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: add w8, w9, w8 diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll index a580913d40d9..67a0379d0524 100644 --- a/llvm/test/CodeGen/AArch64/vector-compress.ll +++ b/llvm/test/CodeGen/AArch64/vector-compress.ll @@ -12,16 +12,15 @@ define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask) { ; CHECK-NEXT: shl.4s v1, v1, #31 ; CHECK-NEXT: cmlt.4s v1, v1, #0 ; CHECK-NEXT: mov.s w9, v1[1] -; CHECK-NEXT: mov.s w10, v1[2] ; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: mov.s w10, v1[2] +; CHECK-NEXT: and x12, x11, #0x1 ; CHECK-NEXT: bfi x8, x11, #2, #1 -; CHECK-NEXT: and x11, x11, #0x1 -; CHECK-NEXT: and x9, x9, #0x1 -; CHECK-NEXT: and w10, w10, #0x1 -; CHECK-NEXT: add x9, x11, x9 ; CHECK-NEXT: mov x11, sp +; CHECK-NEXT: and x9, x9, #0x1 +; CHECK-NEXT: add x9, x12, x9 ; CHECK-NEXT: st1.s { v0 }[1], [x8] -; CHECK-NEXT: add w10, w9, w10 +; CHECK-NEXT: sub w10, w9, w10 ; CHECK-NEXT: orr x9, x11, x9, lsl #2 ; CHECK-NEXT: bfi x11, x10, #2, #2 ; CHECK-NEXT: st1.s { v0 }[2], [x9] @@ -93,7 +92,8 @@ define <2 x double> @test_compress_v2f64(<2 x double> %vec, <2 x i1> %mask) { ; CHECK-NEXT: shl.2d v1, v1, #63 ; CHECK-NEXT: cmlt.2d v1, v1, #0 ; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: bfi x8, x9, #3, #1 +; CHECK-NEXT: and x9, x9, #0x8 +; CHECK-NEXT: orr x8, x8, x9 ; CHECK-NEXT: st1.d { v0 }[1], [x8] ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret @@ -420,16 +420,15 @@ define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) { ; CHECK-NEXT: shl.4s v1, v1, #31 ; CHECK-NEXT: cmlt.4s v1, v1, #0 ; CHECK-NEXT: mov.s w8, v1[1] -; CHECK-NEXT: mov.s w9, v1[2] ; CHECK-NEXT: fmov w10, 
s1 +; CHECK-NEXT: mov.s w9, v1[2] +; CHECK-NEXT: and x12, x10, #0x1 ; CHECK-NEXT: bfi x11, x10, #2, #1 -; CHECK-NEXT: and x10, x10, #0x1 -; CHECK-NEXT: and x8, x8, #0x1 -; CHECK-NEXT: and w9, w9, #0x1 -; CHECK-NEXT: add x8, x10, x8 ; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: and x8, x8, #0x1 +; CHECK-NEXT: add x8, x12, x8 ; CHECK-NEXT: st1.s { v0 }[1], [x11] -; CHECK-NEXT: add w9, w8, w9 +; CHECK-NEXT: sub w9, w8, w9 ; CHECK-NEXT: orr x8, x10, x8, lsl #2 ; CHECK-NEXT: bfi x10, x9, #2, #2 ; CHECK-NEXT: st1.s { v0 }[2], [x8] diff --git a/llvm/test/CodeGen/AArch64/vector-to-scalar-bitmask.ll b/llvm/test/CodeGen/AArch64/vector-to-scalar-bitmask.ll new file mode 100644 index 000000000000..01c83ca220b6 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/vector-to-scalar-bitmask.ll @@ -0,0 +1,75 @@ +; RUN: llc -O3 -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK-LE +; RUN: llc -O3 -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK-BE + +define i16 @convert_to_bitmask16(<16 x i8> %vec) { + %cmp_result = icmp ne <16 x i8> %vec, zeroinitializer + %bitmask = bitcast <16 x i1> %cmp_result to i16 + ret i16 %bitmask +} + +define i16 @convert_to_bitmask8(<8 x i16> %vec) { + %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer + %bitmask = bitcast <8 x i1> %cmp_result to i8 + %extended_bitmask = zext i8 %bitmask to i16 + ret i16 %extended_bitmask +} + +; Little endian + +; CHECK-LE-LABEL: .LCPI0_0: +; CHECK-LE-NEXT: .byte 1 +; CHECK-LE-NEXT: .byte 2 +; CHECK-LE-NEXT: .byte 4 +; CHECK-LE-NEXT: .byte 8 +; CHECK-LE-NEXT: .byte 16 +; CHECK-LE-NEXT: .byte 32 +; CHECK-LE-NEXT: .byte 64 +; CHECK-LE-NEXT: .byte 128 +; CHECK-LE-NEXT: .byte 1 +; CHECK-LE-NEXT: .byte 2 +; CHECK-LE-NEXT: .byte 4 +; CHECK-LE-NEXT: .byte 8 +; CHECK-LE-NEXT: .byte 16 +; CHECK-LE-NEXT: .byte 32 +; CHECK-LE-NEXT: .byte 64 +; CHECK-LE-NEXT: .byte 128 + +; CHECK-LE-LABEL: .LCPI1_0: +; CHECK-LE-NEXT: .hword 1 +; CHECK-LE-NEXT: .hword 2 +; CHECK-LE-NEXT: .hword 4 +; CHECK-LE-NEXT: 
.hword 8 +; CHECK-LE-NEXT: .hword 16 +; CHECK-LE-NEXT: .hword 32 +; CHECK-LE-NEXT: .hword 64 +; CHECK-LE-NEXT: .hword 128 + +; Big endian + +; CHECK-BE-LABEL: .LCPI0_0: +; CHECK-BE-NEXT: .byte 128 +; CHECK-BE-NEXT: .byte 64 +; CHECK-BE-NEXT: .byte 32 +; CHECK-BE-NEXT: .byte 16 +; CHECK-BE-NEXT: .byte 8 +; CHECK-BE-NEXT: .byte 4 +; CHECK-BE-NEXT: .byte 2 +; CHECK-BE-NEXT: .byte 1 +; CHECK-BE-NEXT: .byte 128 +; CHECK-BE-NEXT: .byte 64 +; CHECK-BE-NEXT: .byte 32 +; CHECK-BE-NEXT: .byte 16 +; CHECK-BE-NEXT: .byte 8 +; CHECK-BE-NEXT: .byte 4 +; CHECK-BE-NEXT: .byte 2 +; CHECK-BE-NEXT: .byte 1 + +; CHECK-BE-LABEL: .LCPI1_0: +; CHECK-BE-NEXT: .hword 128 +; CHECK-BE-NEXT: .hword 64 +; CHECK-BE-NEXT: .hword 32 +; CHECK-BE-NEXT: .hword 16 +; CHECK-BE-NEXT: .hword 8 +; CHECK-BE-NEXT: .hword 4 +; CHECK-BE-NEXT: .hword 2 +; CHECK-BE-NEXT: .hword 1 diff --git a/llvm/test/CodeGen/AArch64/wineh-bti-funclet.ll b/llvm/test/CodeGen/AArch64/wineh-bti-funclet.ll new file mode 100644 index 000000000000..4f4f984b8974 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/wineh-bti-funclet.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-windows -mattr=+bti -o - %s | FileCheck %s + +declare i32 @__CxxFrameHandler3(...) +declare void @may_throw() + +; Purpose: For WinEH funclets, entry is call-like: accept `bti c` / `hint #34` or a PAC prologue. + +define dso_local void @wineh_funclet() #0 personality ptr @__CxxFrameHandler3 { +; CHECK-LABEL: wineh_funclet: +; CHECK: .Lfunc_begin0: +; CHECK-NEXT: .seh_proc wineh_funclet +; CHECK-NEXT: .seh_handler __CxxFrameHandler3, @unwind, @except +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: bti c +; CHECK-NEXT: .seh_nop +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr_x 32 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .seh_set_fp +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: mov x0, #-2 // =0xfffffffffffffffe +; CHECK-NEXT: stur x0, [x29, #16] +; CHECK-NEXT: .Ltmp0: // EH_LABEL +; CHECK-NEXT: bl may_throw +; CHECK-NEXT: .Ltmp1: // EH_LABEL +; CHECK-NEXT: .LBB0_1: // Block address taken +; CHECK-NEXT: // %try.cont +; CHECK-NEXT: $ehgcr_0_1: +; CHECK-NEXT: bti j +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr_x 32 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_handlerdata +; CHECK-NEXT: .word $cppxdata$wineh_funclet@IMGREL +; CHECK-NEXT: .text +; CHECK-NEXT: .seh_endproc +; CHECK-NEXT: .def "?catch$2@?0?wineh_funclet@4HA"; +; CHECK-NEXT: .scl 3; +; CHECK-NEXT: .type 32; +; CHECK-NEXT: .endef +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: "?catch$2@?0?wineh_funclet@4HA": +; CHECK-NEXT: .seh_proc "?catch$2@?0?wineh_funclet@4HA" +; CHECK-NEXT: .seh_handler __CxxFrameHandler3, @unwind, @except +; CHECK-NEXT: .LBB0_2: // %catch +; CHECK-NEXT: bti c +; CHECK-NEXT: .seh_nop +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr_x 16 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: bl may_throw +; CHECK-NEXT: adrp x0, .LBB0_1 +; CHECK-NEXT: add x0, x0, .LBB0_1 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr_x 16 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +entry: + invoke void @may_throw() + to label %try.cont unwind label %catch.dispatch + +catch.dispatch: + %cs = catchswitch within none [label %catch] unwind to caller + +catch: + %cp = catchpad within %cs [ptr null, i32 0, ptr null] + call void @may_throw() ["funclet"(token %cp)] + catchret from %cp to label %try.cont + +try.cont: + ret void +} + +attributes #0 = { noinline "branch-target-enforcement"="true" } diff --git a/llvm/test/CodeGen/AArch64/wineh-bti.ll b/llvm/test/CodeGen/AArch64/wineh-bti.ll index 8b8960d37f9e..86555a7f6436 100644 --- a/llvm/test/CodeGen/AArch64/wineh-bti.ll +++ b/llvm/test/CodeGen/AArch64/wineh-bti.ll @@ -29,7 +29,7 @@ lbl4: ; CHECK-LABEL: func: ; CHECK-NEXT: .seh_proc func -; CHECK-NEXT: // %bb.0: +; CHECK: // %bb.0: // %entry ; CHECK-NEXT: hint #34 ; CHECK-NEXT: .seh_nop ; CHECK-NEXT: str x19, [sp, #-16]! |
