diff options
Diffstat (limited to 'llvm/test/CodeGen')
109 files changed, 11554 insertions, 3302 deletions
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-exchange-fence.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-exchange-fence.ll new file mode 100644 index 000000000000..2adbc709d238 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-exchange-fence.ll @@ -0,0 +1,64 @@ +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lse -O0 | FileCheck %s +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+lse -O1 | FileCheck %s + +; When their destination register is WZR/XZR, SWP operations are not regarded as +; a read for the purpose of a DMB.LD in the AArch64 memory model. +; This test ensures that the AArch64DeadRegisterDefinitions pass does not +; replace the destination register of SWP instructions with the zero register +; when the read value is unused. + +define dso_local i32 @atomic_exchange_monotonic(ptr %ptr, ptr %ptr2, i32 %value) { +; CHECK-LABEL: atomic_exchange_monotonic: +; CHECK: // %bb.0: +; CHECK-NEXT: swp +; CHECK-NOT: wzr +; CHECK-NEXT: dmb ishld +; CHECK-NEXT: ldr w0, [x1] +; CHECK-NEXT: ret + %r0 = atomicrmw xchg ptr %ptr, i32 %value monotonic + fence acquire + %r1 = load atomic i32, ptr %ptr2 monotonic, align 4 + ret i32 %r1 +} + +define dso_local i32 @atomic_exchange_acquire(ptr %ptr, ptr %ptr2, i32 %value) { +; CHECK-LABEL: atomic_exchange_acquire: +; CHECK: // %bb.0: +; CHECK-NEXT: swpa +; CHECK-NOT: wzr +; CHECK-NEXT: dmb ishld +; CHECK-NEXT: ldr w0, [x1] +; CHECK-NEXT: ret + %r0 = atomicrmw xchg ptr %ptr, i32 %value acquire + fence acquire + %r1 = load atomic i32, ptr %ptr2 monotonic, align 4 + ret i32 %r1 +} + +define dso_local i32 @atomic_exchange_release(ptr %ptr, ptr %ptr2, i32 %value) { +; CHECK-LABEL: atomic_exchange_release: +; CHECK: // %bb.0: +; CHECK-NEXT: swpl +; CHECK-NOT: wzr +; CHECK-NEXT: dmb ishld +; CHECK-NEXT: ldr w0, [x1] +; CHECK-NEXT: ret + %r0 = atomicrmw xchg ptr %ptr, i32 %value release + fence acquire + %r1 = load atomic i32, ptr %ptr2 monotonic, align 4 + 
ret i32 %r1 +} + +define dso_local i32 @atomic_exchange_acquire_release(ptr %ptr, ptr %ptr2, i32 %value) { +; CHECK-LABEL: atomic_exchange_acquire_release: +; CHECK: // %bb.0: +; CHECK-NEXT: swpal +; CHECK-NOT: wzr +; CHECK-NEXT: dmb ishld +; CHECK-NEXT: ldr w0, [x1] +; CHECK-NEXT: ret + %r0 = atomicrmw xchg ptr %ptr, i32 %value acq_rel + fence acquire + %r1 = load atomic i32, ptr %ptr2 monotonic, align 4 + ret i32 %r1 +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir index 9d12c3c32c7f..71094825e42f 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir @@ -367,3 +367,32 @@ body: | %shuf:_(<4 x s32>) = G_SHUFFLE_VECTOR %buildvec(<4 x s32>), %undef, shufflemask(0, 0, 0, 0) $q0 = COPY %shuf(<4 x s32>) RET_ReallyLR implicit $q0 + +... +--- +name: build_vector_rhs +alignment: 4 +legalized: true +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $w0, $w1, $w2, $w3, $w4 + ; The G_SHUFFLE_VECTOR is fed by a G_BUILD_VECTOR, and the 0th input + ; operand is not a constant. We should get a G_DUP. 
+ ; + ; CHECK-LABEL: name: build_vector + ; CHECK: liveins: $w0, $w1, $w2, $w3, $w4 + ; CHECK: %lane_1:_(s32) = COPY $w1 + ; CHECK: %shuf:_(<4 x s32>) = G_DUP %lane_1(s32) + ; CHECK: $q0 = COPY %shuf(<4 x s32>) + ; CHECK: RET_ReallyLR implicit $q0 + %lane_0:_(s32) = COPY $w0 + %lane_1:_(s32) = COPY $w1 + %b:_(s32) = COPY $w2 + %c:_(s32) = COPY $w3 + %d:_(s32) = COPY $w4 + %buildvec0:_(<4 x s32>) = G_BUILD_VECTOR %lane_0(s32), %b(s32), %c(s32), %d(s32) + %buildvec1:_(<4 x s32>) = G_BUILD_VECTOR %lane_1(s32), %b(s32), %c(s32), %d(s32) + %shuf:_(<4 x s32>) = G_SHUFFLE_VECTOR %buildvec0(<4 x s32>), %buildvec1, shufflemask(4, 4, 4, 4) + $q0 = COPY %shuf(<4 x s32>) + RET_ReallyLR implicit $q0 diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll index bfcb9a710f70..ba611493e1a7 100644 --- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll @@ -22,7 +22,6 @@ ; CHECK-NEXT: Lower Garbage Collection Instructions ; CHECK-NEXT: Shadow Stack GC Lowering ; CHECK-NEXT: Remove unreachable blocks from the CFG -; CHECK-NEXT: Expand vector predication intrinsics ; CHECK-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining) ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index 017349aa32af..845634e8e983 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -60,7 +60,6 @@ ; CHECK-NEXT: Constant Hoisting ; CHECK-NEXT: Replace intrinsics with calls to vector library ; CHECK-NEXT: Partially inline calls to library functions -; CHECK-NEXT: Expand vector predication intrinsics ; CHECK-NEXT: Instrument function entry/exit with calls to e.g. 
mcount() (post inlining) ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics diff --git a/llvm/test/CodeGen/AArch64/abds.ll b/llvm/test/CodeGen/AArch64/abds.ll index d4ad33f963ba..215907c66a6e 100644 --- a/llvm/test/CodeGen/AArch64/abds.ll +++ b/llvm/test/CodeGen/AArch64/abds.ll @@ -571,6 +571,28 @@ define i32 @abd_sub_i32(i32 %a, i32 %b) nounwind { ret i32 %abs } +define i64 @vector_legalized(i16 %a, i16 %b) { +; CHECK-LABEL: vector_legalized: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: sxth w8, w0 +; CHECK-NEXT: sub w8, w8, w1, sxth +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cneg w8, w8, mi +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: add x0, x9, x8 +; CHECK-NEXT: ret + %ea = sext i16 %a to i32 + %eb = sext i16 %b to i32 + %s = sub i32 %ea, %eb + %ab = call i32 @llvm.abs.i32(i32 %s, i1 false) + %e = zext i32 %ab to i64 + %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> zeroinitializer) + %z = add i64 %red, %e + ret i64 %z +} + declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) diff --git a/llvm/test/CodeGen/AArch64/abdu.ll b/llvm/test/CodeGen/AArch64/abdu.ll index 983db629e449..f70f095d7dab 100644 --- a/llvm/test/CodeGen/AArch64/abdu.ll +++ b/llvm/test/CodeGen/AArch64/abdu.ll @@ -409,6 +409,32 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ret i128 %sel } +; +; negative tests +; + +define i64 @vector_legalized(i16 %a, i16 %b) { +; CHECK-LABEL: vector_legalized: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: sub w8, w8, w1, uxth +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: cneg w8, w8, mi +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: add x0, x9, x8 +; CHECK-NEXT: ret + %ea = zext i16 %a to i32 + %eb = zext i16 %b to i32 + %s = sub i32 %ea, %eb + %ab = call i32 @llvm.abs.i32(i32 %s, i1 false) + %e = zext i32 %ab to i64 + %red = call i64 
@llvm.vector.reduce.add.v32i64(<32 x i64> zeroinitializer) + %z = add i64 %red, %e + ret i64 %z +} + declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) declare i32 @llvm.abs.i32(i32, i1) diff --git a/llvm/test/CodeGen/AArch64/emutls_alias.ll b/llvm/test/CodeGen/AArch64/emutls_alias.ll new file mode 100644 index 000000000000..4a157d8d03e7 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/emutls_alias.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s -emulated-tls -mtriple=aarch64-linux-ohos \ +; RUN: | FileCheck -check-prefix=EMUTLS_CHECK %s + +%struct.__res_state = type { [5 x i8] } + +@foo = dso_local thread_local global %struct.__res_state { [5 x i8] c"\01\02\03\04\05" }, align 1 + +@bar = hidden thread_local(initialexec) alias %struct.__res_state, ptr @foo + +define dso_local i32 @main() { + %1 = alloca i32, align 4 + store i32 0, ptr %1, align 4 + store i8 0, ptr @bar, align 1 + ; EMUTLS_CHECK: adrp x0, __emutls_v.foo + ; EMUTLS_CHECK-NEXT: add x0, x0, :lo12:__emutls_v.foo + ret i32 0 +} diff --git a/llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll b/llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll index 728cffeba02a..b2ebf1fc0411 100644 --- a/llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll +++ b/llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll @@ -10,7 +10,7 @@ !llvm.module.flags = !{!0, !1} !0 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458} -!1 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 85} +!1 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 341} ; ASM: .section .note.gnu.property,"a",@note ; ASM-NEXT: .p2align 3, 0x0 @@ -22,12 +22,12 @@ ; ASM-NEXT: .word 3221225473 ; ASM-NEXT: .word 16 ; ASM-NEXT: .xword 268435458 -; ASM-NEXT: .xword 85 +; ASM-NEXT: .xword 341 ; OBJ: Displaying notes found in: .note.gnu.property ; OBJ-NEXT: Owner Data size Description ; OBJ-NEXT: GNU 0x00000018 NT_GNU_PROPERTY_TYPE_0 (property note) -; OBJ-NEXT: AArch64 PAuth ABI core info: platform 0x10000002 (llvm_linux), version 
0x55 (PointerAuthIntrinsics, !PointerAuthCalls, PointerAuthReturns, !PointerAuthAuthTraps, PointerAuthVTPtrAddressDiscrimination, !PointerAuthVTPtrTypeDiscrimination, PointerAuthInitFini) +; OBJ-NEXT: AArch64 PAuth ABI core info: platform 0x10000002 (llvm_linux), version 0x155 (PointerAuthIntrinsics, !PointerAuthCalls, PointerAuthReturns, !PointerAuthAuthTraps, PointerAuthVTPtrAddressDiscrimination, !PointerAuthVTPtrTypeDiscrimination, PointerAuthInitFini, !PointerAuthInitFiniAddressDiscrimination, PointerAuthELFGOT) ; ERR: either both or no 'aarch64-elf-pauthabi-platform' and 'aarch64-elf-pauthabi-version' module flags must be present diff --git a/llvm/test/CodeGen/AArch64/ptrauth-basic-pic.ll b/llvm/test/CodeGen/AArch64/ptrauth-basic-pic.ll new file mode 100644 index 000000000000..de6901f10761 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ptrauth-basic-pic.ll @@ -0,0 +1,82 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -global-isel=0 -fast-isel=0 -verify-machineinstrs \ +; RUN: -relocation-model=pic -mattr=+pauth %s -o - | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -global-isel=0 -fast-isel=1 -verify-machineinstrs \ +; RUN: -relocation-model=pic -mattr=+pauth %s -o - | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -global-isel=1 -global-isel-abort=1 -verify-machineinstrs \ +; RUN: -relocation-model=pic -mattr=+pauth %s -o - | FileCheck %s + +;; Note: for FastISel, we fall back to SelectionDAG + +@var = global i32 0 + +define i32 @get_globalvar() { +; CHECK-LABEL: get_globalvar: +; CHECK: adrp x[[GOT:[0-9]+]], :got_auth:var +; CHECK-NEXT: add x[[GOT]], x[[GOT]], :got_auth_lo12:var +; CHECK-NEXT: ldr x[[SYM:[0-9]+]], [x[[GOT]]] +; CHECK-NEXT: autda x[[SYM]], x[[GOT]] +; CHECK-NEXT: ldr w0, [x[[SYM]]] + + %val = load i32, ptr @var + ret i32 %val +} + +define ptr @get_globalvaraddr() { +; CHECK-LABEL: get_globalvaraddr: +; CHECK: adrp x[[GOT:[0-9]+]], :got_auth:var +; CHECK-NEXT: add x[[GOT]], x[[GOT]], :got_auth_lo12:var +; CHECK-NEXT: ldr x0, [x[[GOT]]] 
+; CHECK-NEXT: autda x0, x[[GOT]] + + %val = load i32, ptr @var + ret ptr @var +} + +declare i32 @foo() + +define ptr @resign_globalfunc() { +; CHECK-LABEL: resign_globalfunc: +; CHECK: adrp x17, :got_auth:foo +; CHECK-NEXT: add x17, x17, :got_auth_lo12:foo +; CHECK-NEXT: ldr x16, [x17] +; CHECK-NEXT: autia x16, x17 +; CHECK-NEXT: mov x17, #42 +; CHECK-NEXT: pacia x16, x17 +; CHECK-NEXT: mov x0, x16 +; CHECK-NEXT: ret + + ret ptr ptrauth (ptr @foo, i32 0, i64 42) +} + +define ptr @resign_globalvar() { +; CHECK-LABEL: resign_globalvar: +; CHECK: adrp x17, :got_auth:var +; CHECK-NEXT: add x17, x17, :got_auth_lo12:var +; CHECK-NEXT: ldr x16, [x17] +; CHECK-NEXT: autda x16, x17 +; CHECK-NEXT: mov x17, #43 +; CHECK-NEXT: pacdb x16, x17 +; CHECK-NEXT: mov x0, x16 +; CHECK-NEXT: ret + + ret ptr ptrauth (ptr @var, i32 3, i64 43) +} + +define ptr @resign_globalvar_offset() { +; CHECK-LABEL: resign_globalvar_offset: +; CHECK: adrp x17, :got_auth:var +; CHECK-NEXT: add x17, x17, :got_auth_lo12:var +; CHECK-NEXT: ldr x16, [x17] +; CHECK-NEXT: autda x16, x17 +; CHECK-NEXT: add x16, x16, #16 +; CHECK-NEXT: mov x17, #44 +; CHECK-NEXT: pacda x16, x17 +; CHECK-NEXT: mov x0, x16 +; CHECK-NEXT: ret + + ret ptr ptrauth (ptr getelementptr (i8, ptr @var, i64 16), i32 2, i64 44) +} + +!llvm.module.flags = !{!0, !1} +!0 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458} +!1 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 256} diff --git a/llvm/test/CodeGen/AArch64/ptrauth-elf-globals-pic.ll b/llvm/test/CodeGen/AArch64/ptrauth-elf-globals-pic.ll new file mode 100644 index 000000000000..2b7d8637b432 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ptrauth-elf-globals-pic.ll @@ -0,0 +1,23 @@ +; RUN: llc -mtriple=arm64 -global-isel=0 -fast-isel=0 -relocation-model=pic -o - %s -mcpu=cyclone -mattr=+pauth | FileCheck %s +; RUN: llc -mtriple=arm64 -global-isel=0 -fast-isel=1 -relocation-model=pic -o - %s -mcpu=cyclone -mattr=+pauth | FileCheck %s +; RUN: llc -mtriple=arm64 -global-isel=1 
-global-isel-abort=1 -relocation-model=pic -o - %s -mcpu=cyclone -mattr=+pauth | FileCheck %s + +;; Note: for FastISel, we fall back to SelectionDAG + +@var8 = external global i8, align 1 + +define i8 @test_i8(i8 %new) { + %val = load i8, ptr @var8, align 1 + store i8 %new, ptr @var8 + ret i8 %val + +; CHECK: adrp x[[HIREG:[0-9]+]], :got_auth:var8 +; CHECK-NEXT: add x[[HIREG]], x[[HIREG]], :got_auth_lo12:var8 +; CHECK-NEXT: ldr x[[VAR_ADDR:[0-9]+]], [x[[HIREG]]] +; CHECK-NEXT: autda x[[VAR_ADDR]], x[[HIREG]] +; CHECK-NEXT: ldrb {{w[0-9]+}}, [x[[VAR_ADDR]]] +} + +!llvm.module.flags = !{!0, !1} +!0 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458} +!1 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 256} diff --git a/llvm/test/CodeGen/AArch64/ptrauth-extern-weak.ll b/llvm/test/CodeGen/AArch64/ptrauth-extern-weak.ll new file mode 100644 index 000000000000..88b611141c04 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ptrauth-extern-weak.ll @@ -0,0 +1,36 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel=0 -fast-isel=0 -relocation-model=pic -mattr=+pauth -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel=0 -fast-isel=1 -relocation-model=pic -mattr=+pauth -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel=1 -global-isel-abort=1 -relocation-model=pic -mattr=+pauth -o - %s | FileCheck %s + +;; Note: for FastISel, we fall back to SelectionDAG + +declare extern_weak dso_local i32 @var() + +define ptr @foo() { +; The usual ADRP/ADD pair can't be used for a weak reference because it must +; evaluate to 0 if the symbol is undefined. We use a GOT entry for PIC +; otherwise a litpool entry. 
+ ret ptr @var + +; CHECK: adrp x[[ADDRHI:[0-9]+]], :got_auth:var +; CHECK-NEXT: add x[[ADDRHI]], x[[ADDRHI]], :got_auth_lo12:var +; CHECK-NEXT: ldr x0, [x[[ADDRHI]]] +; CHECK-NEXT: autia x0, x[[ADDRHI]] +} + +@arr_var = extern_weak global [10 x i32] + +define ptr @bar() { + %addr = getelementptr [10 x i32], ptr @arr_var, i32 0, i32 5 + +; CHECK: adrp x[[ADDRHI:[0-9]+]], :got_auth:arr_var +; CHECK-NEXT: add x[[ADDRHI]], x[[ADDRHI]], :got_auth_lo12:arr_var +; CHECK-NEXT: ldr [[BASE:x[0-9]+]], [x[[ADDRHI]]] +; CHECK-NEXT: autda [[BASE]], x[[ADDRHI]] +; CHECK-NEXT: add x0, [[BASE]], #20 + ret ptr %addr +} + +!llvm.module.flags = !{!0, !1} +!0 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458} +!1 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 256} diff --git a/llvm/test/CodeGen/AArch64/ptrauth-got-abuse.ll b/llvm/test/CodeGen/AArch64/ptrauth-got-abuse.ll new file mode 100644 index 000000000000..c1580534b62c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ptrauth-got-abuse.ll @@ -0,0 +1,44 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -asm-verbose=false -global-isel=0 -fast-isel=0 -relocation-model=pic -mattr=+pauth -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -asm-verbose=false -global-isel=0 -fast-isel=1 -relocation-model=pic -mattr=+pauth -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -asm-verbose=false -global-isel=1 -global-isel-abort=1 -relocation-model=pic -mattr=+pauth -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -asm-verbose=false -global-isel=0 -fast-isel=0 -relocation-model=pic -filetype=obj -mattr=+pauth -o /dev/null %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -asm-verbose=false -global-isel=0 -fast-isel=1 -relocation-model=pic -filetype=obj -mattr=+pauth -o /dev/null %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -asm-verbose=false -global-isel=1 -global-isel-abort=1 -relocation-model=pic -filetype=obj -mattr=+pauth -o /dev/null %s + +;; Note: for FastISel, we fall back to 
SelectionDAG + +declare void @consume(i32) +declare void @func() + +define void @aliasee_func() { + ret void +} +@alias_func = alias void (), ptr @aliasee_func + +@aliasee_global = global i32 42 +@alias_global = alias i32, ptr @aliasee_global + +define void @foo() nounwind { +; CHECK-LABEL: foo: +entry: + call void @consume(i32 ptrtoint (ptr @func to i32)) +; CHECK: adrp x[[ADDRHI:[0-9]+]], :got_auth:func +; CHECK-NEXT: add x[[ADDRHI]], x[[ADDRHI]], :got_auth_lo12:func +; CHECK-NEXT: ldr x[[SYM:[0-9]+]], [x[[ADDRHI]]] +; CHECK-NEXT: autia x[[SYM:[0-9]+]], x[[ADDRHI]] + call void @consume(i32 ptrtoint (ptr @alias_func to i32)) +; CHECK: adrp x[[ADDRHI:[0-9]+]], :got_auth:alias_func +; CHECK-NEXT: add x[[ADDRHI]], x[[ADDRHI]], :got_auth_lo12:alias_func +; CHECK-NEXT: ldr x[[SYM:[0-9]+]], [x[[ADDRHI]]] +; CHECK-NEXT: autia x[[SYM:[0-9]+]], x[[ADDRHI]] + call void @consume(i32 ptrtoint (ptr @alias_global to i32)) +; CHECK: adrp x[[ADDRHI:[0-9]+]], :got_auth:alias_global +; CHECK-NEXT: add x[[ADDRHI]], x[[ADDRHI]], :got_auth_lo12:alias_global +; CHECK-NEXT: ldr x[[SYM:[0-9]+]], [x[[ADDRHI]]] +; CHECK-NEXT: autda x[[SYM:[0-9]+]], x[[ADDRHI]] + ret void +} + +!llvm.module.flags = !{!0, !1} +!0 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458} +!1 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 256} diff --git a/llvm/test/CodeGen/AArch64/ptrauth-init-fini.ll b/llvm/test/CodeGen/AArch64/ptrauth-init-fini.ll new file mode 100644 index 000000000000..186a31c63ba1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ptrauth-init-fini.ll @@ -0,0 +1,104 @@ +; RUN: rm -rf %t && split-file %s %t && cd %t + +;--- nodisc.ll + +; RUN: llc -mtriple aarch64-elf -mattr=+pauth -filetype=asm -o - nodisc.ll | \ +; RUN: FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple aarch64-elf -mattr=+pauth -filetype=obj -o - nodisc.ll | \ +; RUN: llvm-readelf -r -x .init_array -x .fini_array - | FileCheck %s --check-prefix=OBJ + +; ASM: .section .init_array,"aw",@init_array +; ASM-NEXT: .p2align 
3, 0x0 +; ASM-NEXT: .xword foo@AUTH(ia,55764) +; ASM-NEXT: .section .fini_array,"aw",@fini_array +; ASM-NEXT: .p2align 3, 0x0 +; ASM-NEXT: .xword bar@AUTH(ia,55764) + +; OBJ: Relocation section '.rela.init_array' at offset 0x[[#]] contains 1 entries: +; OBJ-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend +; OBJ-NEXT: 0000000000000000 0000000700000244 R_AARCH64_AUTH_ABS64 0000000000000000 foo + 0 +; OBJ: Relocation section '.rela.fini_array' at offset 0x[[#]] contains 1 entries: +; OBJ-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend +; OBJ-NEXT: 0000000000000000 0000000800000244 R_AARCH64_AUTH_ABS64 0000000000000004 bar + 0 +; OBJ: Hex dump of section '.init_array': +; OBJ-NEXT: 0x00000000 00000000 d4d90000 +; OBJ: Hex dump of section '.fini_array': +; OBJ-NEXT: 0x00000000 00000000 d4d90000 +;; ^^^^ 0xD9D4: constant discriminator = 55764 +;; ^^ 0x00: bits 61..60 key = IA; bit 63 addr disc = false + +@llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr ptrauth (ptr @foo, i32 0, i64 55764), ptr null }] +@llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr ptrauth (ptr @bar, i32 0, i64 55764), ptr null }] + +define void @foo() { + ret void +} + +define void @bar() { + ret void +} + +;--- disc.ll + +; RUN: llc -mtriple aarch64-elf -mattr=+pauth -filetype=asm -o - disc.ll | \ +; RUN: FileCheck %s --check-prefix=ASM-DISC +; RUN: llc -mtriple aarch64-elf -mattr=+pauth -filetype=obj -o - disc.ll | \ +; RUN: llvm-readelf -r -x .init_array -x .fini_array - | FileCheck %s --check-prefix=OBJ-DISC + +; ASM-DISC: .section .init_array,"aw",@init_array +; ASM-DISC-NEXT: .p2align 3, 0x0 +; ASM-DISC-NEXT: .xword foo@AUTH(ia,55764,addr) +; ASM-DISC-NEXT: .section .fini_array,"aw",@fini_array +; ASM-DISC-NEXT: .p2align 3, 0x0 +; ASM-DISC-NEXT: .xword bar@AUTH(ia,55764,addr) + +; OBJ-DISC: Relocation section '.rela.init_array' at offset 0x[[#]] contains 1 entries: +; 
OBJ-DISC-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend +; OBJ-DISC-NEXT: 0000000000000000 0000000700000244 R_AARCH64_AUTH_ABS64 0000000000000000 foo + 0 +; OBJ-DISC: Relocation section '.rela.fini_array' at offset 0x[[#]] contains 1 entries: +; OBJ-DISC-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend +; OBJ-DISC-NEXT: 0000000000000000 0000000800000244 R_AARCH64_AUTH_ABS64 0000000000000004 bar + 0 +; OBJ-DISC: Hex dump of section '.init_array': +; OBJ-DISC-NEXT: 0x00000000 00000000 d4d90080 +; OBJ-DISC: Hex dump of section '.fini_array': +; OBJ-DISC-NEXT: 0x00000000 00000000 d4d90080 +;; ^^^^ 0xD9D4: constant discriminator = 55764 +;; ^^ 0x80: bits 61..60 key = IA; bit 63 addr disc = true + +@llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr ptrauth (ptr @foo, i32 0, i64 55764, ptr inttoptr (i64 1 to ptr)), ptr null }] +@llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr ptrauth (ptr @bar, i32 0, i64 55764, ptr inttoptr (i64 1 to ptr)), ptr null }] + +define void @foo() { + ret void +} + +define void @bar() { + ret void +} + +;--- err1.ll + +; RUN: not --crash llc -mtriple aarch64-elf -mattr=+pauth -filetype=asm -o - err1.ll 2>&1 | \ +; RUN: FileCheck %s --check-prefix=ERR1 + +; ERR1: LLVM ERROR: unexpected address discrimination value for ctors/dtors entry, only 'ptr inttoptr (i64 1 to ptr)' is allowed + +@llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr ptrauth (ptr @foo, i32 0, i64 55764, ptr inttoptr (i64 2 to ptr)), ptr null }] + +define void @foo() { + ret void +} + +;--- err2.ll + +; RUN: not --crash llc -mtriple aarch64-elf -mattr=+pauth -filetype=asm -o - err2.ll 2>&1 | \ +; RUN: FileCheck %s --check-prefix=ERR2 + +; ERR2: LLVM ERROR: unexpected address discrimination value for ctors/dtors entry, only 'ptr inttoptr (i64 1 to ptr)' is allowed + +@g = external global ptr +@llvm.global_dtors = 
appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr ptrauth (ptr @bar, i32 0, i64 55764, ptr @g), ptr null }] + +define void @bar() { + ret void +} diff --git a/llvm/test/CodeGen/AArch64/ptrauth-tagged-globals-pic.ll b/llvm/test/CodeGen/AArch64/ptrauth-tagged-globals-pic.ll new file mode 100644 index 000000000000..c9a6722505bd --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ptrauth-tagged-globals-pic.ll @@ -0,0 +1,66 @@ +; RUN: llc --relocation-model=pic -mattr=+pauth < %s | FileCheck %s --check-prefixes=CHECK,GISEL + +; RUN: llc -global-isel=0 -fast-isel=0 -O0 --relocation-model=pic < %s -mattr=+pauth | FileCheck %s --check-prefixes=CHECK,DAGISEL +; RUN: llc -global-isel=0 -fast-isel=1 -O0 --relocation-model=pic < %s -mattr=+pauth | FileCheck %s --check-prefixes=CHECK,DAGISEL +; RUN: llc -global-isel=1 -global-isel-abort=1 -O0 --relocation-model=pic < %s -mattr=+pauth | FileCheck %s --check-prefixes=CHECK,GISEL + +;; Note: for FastISel, we fall back to SelectionDAG + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-android" + +@global = external global i32 +declare void @func() + +define ptr @global_addr() #0 { + ; CHECK-LABEL: global_addr: + ; CHECK: adrp [[REG:x[0-9]+]], :got_auth:global + ; CHECK-NEXT: add [[REG]], [[REG]], :got_auth_lo12:global + ; CHECK-NEXT: ldr x0, [[[REG]]] + ; CHECK-NEXT: autda x0, [[REG]] + ; CHECK-NEXT: ret + + ret ptr @global +} + +define i32 @global_load() #0 { + ; CHECK-LABEL: global_load: + ; CHECK: adrp [[REG0:x[0-9]+]], :got_auth:global + ; CHECK-NEXT: add [[REG0]], [[REG0]], :got_auth_lo12:global + ; CHECK-NEXT: ldr [[REG1:x[0-9]+]], [[[REG0]]] + ; CHECK-NEXT: autda [[REG1]], [[REG0]] + ; CHECK-NEXT: ldr w0, [[[REG1]]] + ; CHECK-NEXT: ret + %load = load i32, ptr @global + ret i32 %load +} + +define void @global_store() #0 { + ; CHECK-LABEL: global_store: + ; CHECK: adrp [[REG0:x[0-9]+]], :got_auth:global + ; CHECK-NEXT: add [[REG0]], 
[[REG0]], :got_auth_lo12:global + ; CHECK-NEXT: ldr [[REG1:x[0-9]+]], [[[REG0]]] + ; CHECK-NEXT: autda [[REG1]], [[REG0]] + ; GISEL-NEXT: str wzr, [[[REG1]]] + ; DAGISEL-NEXT: mov w8, wzr + ; DAGISEL-NEXT: str w8, [[[REG1]]] + ; CHECK-NEXT: ret + store i32 0, ptr @global + ret void +} + +define ptr @func_addr() #0 { + ; CHECK-LABEL: func_addr: + ; CHECK: adrp [[REG:x[0-9]+]], :got_auth:func + ; CHECK-NEXT: add [[REG]], [[REG]], :got_auth_lo12:func + ; CHECK-NEXT: ldr x0, [[[REG]]] + ; CHECK-NEXT: autia x0, [[REG]] + ; CHECK-NEXT: ret + ret ptr @func +} + +attributes #0 = { "target-features"="+tagged-globals" } + +!llvm.module.flags = !{!0, !1} +!0 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458} +!1 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 256} diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll index 6e2c48f88e02..9d865b1e7447 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+b16b16 -force-streaming -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sve-b16b16 -force-streaming -verify-machineinstrs < %s | FileCheck %s ; SMAX (Single, x2) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll index d37984596f84..575bcbc919b8 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+b16b16 -force-streaming -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 
-mattr=+sve-b16b16 -force-streaming -verify-machineinstrs < %s | FileCheck %s ; SMIN (Single, x2) diff --git a/llvm/test/CodeGen/AArch64/sms-order-physreg-deps.mir b/llvm/test/CodeGen/AArch64/sms-order-physreg-deps.mir new file mode 100644 index 000000000000..4d8067e16b96 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sms-order-physreg-deps.mir @@ -0,0 +1,452 @@ +# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -mcpu=a64fx -aarch64-enable-pipeliner -pipeliner-max-mii=100 -pipeliner-enable-copytophi=0 -debug-only=pipeliner -run-pass=pipeliner -treat-scalable-fixed-error-as-warning 2>&1 | FileCheck %s + +# REQUIRES: asserts + +# Verify that the order of the instructions is correct if they are scheduled in +# the same cycle and they have physical register dependencies. + +# CHECK: Schedule Found? 1 +# CHECK: cycle {{[0-9]+}} (0) {{.*}} SUBS{{.*}} implicit-def $nzcv +# CHECK-NOT: cycle {{[0-9]+}} (0) {{.*}} implicit-def {{.*}} $nzcv + +--- | + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" + + declare void @free(ptr allocptr nocapture noundef) local_unnamed_addr #0 + + define dso_local noundef i32 @main(i32 noundef %argc, ptr nocapture noundef readnone %argv) local_unnamed_addr #1 { + entry: + %ret.i.i55 = alloca ptr, align 8 + %ret.i.i = alloca ptr, align 8 + %0 = load ptr, ptr %ret.i.i, align 8 + br label %vector.ph + + vector.ph: ; preds = %for.inc20.i, %entry + %lsr.iv1 = phi i64 [ %lsr.iv.next2, %for.inc20.i ], [ 0, %entry ] + %indvars.iv45.i = phi i64 [ 0, %entry ], [ %indvars.iv.next46.i, %for.inc20.i ] + %broadcast.splatinsert = insertelement <vscale x 4 x i64> poison, i64 %indvars.iv45.i, i64 0 + %broadcast.splat = shufflevector <vscale x 4 x i64> %broadcast.splatinsert, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv3 = phi i64 [ %lsr.iv.next4, %vector.body ], [ %lsr.iv1, %vector.ph ] + %lsr.iv = phi i64 [ %lsr.iv.next, 
%vector.body ], [ 2800, %vector.ph ] + %vec.ind = phi <vscale x 4 x i64> [ zeroinitializer, %vector.ph ], [ %vec.ind.next.6, %vector.body ] + %1 = mul nuw nsw <vscale x 4 x i64> %vec.ind, %broadcast.splat + %2 = trunc <vscale x 4 x i64> %1 to <vscale x 4 x i32> + %3 = urem <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %4 = add nuw nsw <vscale x 4 x i32> %3, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %5 = shl nuw nsw i64 %lsr.iv3, 2 + %scevgep16 = getelementptr i8, ptr %0, i64 %5 + %6 = add nuw nsw <vscale x 4 x i64> %vec.ind, %broadcast.splat + %7 = trunc <vscale x 4 x i64> %6 to <vscale x 4 x i32> + %8 = urem <vscale x 4 x i32> %7, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 13, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %9 = icmp eq <vscale x 4 x i32> %8, zeroinitializer + %10 = urem <vscale x 4 x i32> %7, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %11 = icmp eq <vscale x 4 x i32> %10, zeroinitializer + %12 = or <vscale x 4 x i1> %9, %11 + %13 = urem <vscale x 4 x i32> %7, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 11, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %14 = icmp eq <vscale x 4 x i32> %13, zeroinitializer + %15 = or <vscale x 4 x i1> %14, %12 + %16 = select <vscale x 4 x i1> %15, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 999, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> %4 + store <vscale x 4 x i32> %16, ptr %scevgep16, align 4 + %vec.ind.next = add <vscale x 4 x i64> 
%vec.ind, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 4, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) + %17 = mul nuw nsw <vscale x 4 x i64> %vec.ind.next, %broadcast.splat + %18 = trunc <vscale x 4 x i64> %17 to <vscale x 4 x i32> + %19 = urem <vscale x 4 x i32> %18, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %20 = add nuw nsw <vscale x 4 x i32> %19, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %21 = shl nuw nsw i64 %lsr.iv3, 2 + %scevgep14 = getelementptr i8, ptr %0, i64 %21 + %scevgep15 = getelementptr i8, ptr %scevgep14, i64 16 + %22 = add nuw nsw <vscale x 4 x i64> %vec.ind.next, %broadcast.splat + %23 = trunc <vscale x 4 x i64> %22 to <vscale x 4 x i32> + %24 = urem <vscale x 4 x i32> %23, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 13, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %25 = icmp eq <vscale x 4 x i32> %24, zeroinitializer + %26 = urem <vscale x 4 x i32> %23, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %27 = icmp eq <vscale x 4 x i32> %26, zeroinitializer + %28 = or <vscale x 4 x i1> %25, %27 + %29 = urem <vscale x 4 x i32> %23, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 11, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %30 = icmp eq <vscale x 4 x i32> %29, zeroinitializer + %31 = or <vscale x 4 x i1> %30, %28 + %32 = select <vscale x 4 x i1> %31, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 999, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 
x i32> %20 + store <vscale x 4 x i32> %32, ptr %scevgep15, align 4 + %vec.ind.next.1 = add <vscale x 4 x i64> %vec.ind, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 8, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) + %33 = mul nuw nsw <vscale x 4 x i64> %vec.ind.next.1, %broadcast.splat + %34 = trunc <vscale x 4 x i64> %33 to <vscale x 4 x i32> + %35 = urem <vscale x 4 x i32> %34, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %36 = add nuw nsw <vscale x 4 x i32> %35, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %37 = shl nuw nsw i64 %lsr.iv3, 2 + %scevgep12 = getelementptr i8, ptr %0, i64 %37 + %scevgep13 = getelementptr i8, ptr %scevgep12, i64 32 + %38 = add nuw nsw <vscale x 4 x i64> %vec.ind.next.1, %broadcast.splat + %39 = trunc <vscale x 4 x i64> %38 to <vscale x 4 x i32> + %40 = urem <vscale x 4 x i32> %39, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 13, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %41 = icmp eq <vscale x 4 x i32> %40, zeroinitializer + %42 = urem <vscale x 4 x i32> %39, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %43 = icmp eq <vscale x 4 x i32> %42, zeroinitializer + %44 = or <vscale x 4 x i1> %41, %43 + %45 = urem <vscale x 4 x i32> %39, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 11, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %46 = icmp eq <vscale x 4 x i32> %45, zeroinitializer + %47 = or <vscale x 4 x i1> %46, %44 + %48 = select <vscale x 4 x i1> %47, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement 
(<vscale x 4 x i32> poison, i32 999, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> %36 + store <vscale x 4 x i32> %48, ptr %scevgep13, align 4 + %vec.ind.next.2 = add <vscale x 4 x i64> %vec.ind, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 12, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) + %49 = mul nuw nsw <vscale x 4 x i64> %vec.ind.next.2, %broadcast.splat + %50 = trunc <vscale x 4 x i64> %49 to <vscale x 4 x i32> + %51 = urem <vscale x 4 x i32> %50, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %52 = add nuw nsw <vscale x 4 x i32> %51, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %53 = shl nuw nsw i64 %lsr.iv3, 2 + %scevgep10 = getelementptr i8, ptr %0, i64 %53 + %scevgep11 = getelementptr i8, ptr %scevgep10, i64 48 + %54 = add nuw nsw <vscale x 4 x i64> %vec.ind.next.2, %broadcast.splat + %55 = trunc <vscale x 4 x i64> %54 to <vscale x 4 x i32> + %56 = urem <vscale x 4 x i32> %55, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 13, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %57 = icmp eq <vscale x 4 x i32> %56, zeroinitializer + %58 = urem <vscale x 4 x i32> %55, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %59 = icmp eq <vscale x 4 x i32> %58, zeroinitializer + %60 = or <vscale x 4 x i1> %57, %59 + %61 = urem <vscale x 4 x i32> %55, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 11, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %62 = icmp eq <vscale x 4 x i32> %61, zeroinitializer + %63 = or <vscale x 4 x i1> 
%62, %60 + %64 = select <vscale x 4 x i1> %63, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 999, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> %52 + store <vscale x 4 x i32> %64, ptr %scevgep11, align 4 + %vec.ind.next.3 = add <vscale x 4 x i64> %vec.ind, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 16, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) + %65 = mul nuw nsw <vscale x 4 x i64> %vec.ind.next.3, %broadcast.splat + %66 = trunc <vscale x 4 x i64> %65 to <vscale x 4 x i32> + %67 = urem <vscale x 4 x i32> %66, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %68 = add nuw nsw <vscale x 4 x i32> %67, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %69 = shl nuw nsw i64 %lsr.iv3, 2 + %scevgep8 = getelementptr i8, ptr %0, i64 %69 + %scevgep9 = getelementptr i8, ptr %scevgep8, i64 64 + %70 = add nuw nsw <vscale x 4 x i64> %vec.ind.next.3, %broadcast.splat + %71 = trunc <vscale x 4 x i64> %70 to <vscale x 4 x i32> + %72 = urem <vscale x 4 x i32> %71, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 13, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %73 = icmp eq <vscale x 4 x i32> %72, zeroinitializer + %74 = urem <vscale x 4 x i32> %71, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %75 = icmp eq <vscale x 4 x i32> %74, zeroinitializer + %76 = or <vscale x 4 x i1> %73, %75 + %77 = urem <vscale x 4 x i32> %71, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 11, i64 0), <vscale x 4 x i32> poison, <vscale x 
4 x i32> zeroinitializer) + %78 = icmp eq <vscale x 4 x i32> %77, zeroinitializer + %79 = or <vscale x 4 x i1> %78, %76 + %80 = select <vscale x 4 x i1> %79, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 999, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> %68 + store <vscale x 4 x i32> %80, ptr %scevgep9, align 4 + %vec.ind.next.4 = add <vscale x 4 x i64> %vec.ind, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 20, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) + %81 = mul nuw nsw <vscale x 4 x i64> %vec.ind.next.4, %broadcast.splat + %82 = trunc <vscale x 4 x i64> %81 to <vscale x 4 x i32> + %83 = urem <vscale x 4 x i32> %82, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %84 = add nuw nsw <vscale x 4 x i32> %83, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %85 = shl nuw nsw i64 %lsr.iv3, 2 + %scevgep6 = getelementptr i8, ptr %0, i64 %85 + %scevgep7 = getelementptr i8, ptr %scevgep6, i64 80 + %86 = add nuw nsw <vscale x 4 x i64> %vec.ind.next.4, %broadcast.splat + %87 = trunc <vscale x 4 x i64> %86 to <vscale x 4 x i32> + %88 = urem <vscale x 4 x i32> %87, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 13, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %89 = icmp eq <vscale x 4 x i32> %88, zeroinitializer + %90 = urem <vscale x 4 x i32> %87, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %91 = icmp eq <vscale x 4 x i32> %90, zeroinitializer + %92 = or <vscale x 4 x i1> %89, %91 + %93 = urem <vscale x 4 x i32> %87, shufflevector 
(<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 11, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %94 = icmp eq <vscale x 4 x i32> %93, zeroinitializer + %95 = or <vscale x 4 x i1> %94, %92 + %96 = select <vscale x 4 x i1> %95, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 999, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> %84 + store <vscale x 4 x i32> %96, ptr %scevgep7, align 4 + %vec.ind.next.5 = add <vscale x 4 x i64> %vec.ind, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 24, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) + %97 = mul nuw nsw <vscale x 4 x i64> %vec.ind.next.5, %broadcast.splat + %98 = trunc <vscale x 4 x i64> %97 to <vscale x 4 x i32> + %99 = urem <vscale x 4 x i32> %98, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %100 = add nuw nsw <vscale x 4 x i32> %99, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %101 = shl nuw nsw i64 %lsr.iv3, 2 + %scevgep = getelementptr i8, ptr %0, i64 %101 + %scevgep5 = getelementptr i8, ptr %scevgep, i64 96 + %102 = add nuw nsw <vscale x 4 x i64> %vec.ind.next.5, %broadcast.splat + %103 = trunc <vscale x 4 x i64> %102 to <vscale x 4 x i32> + %104 = urem <vscale x 4 x i32> %103, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 13, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %105 = icmp eq <vscale x 4 x i32> %104, zeroinitializer + %106 = urem <vscale x 4 x i32> %103, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %107 = icmp eq <vscale 
x 4 x i32> %106, zeroinitializer + %108 = or <vscale x 4 x i1> %105, %107 + %109 = urem <vscale x 4 x i32> %103, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 11, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) + %110 = icmp eq <vscale x 4 x i32> %109, zeroinitializer + %111 = or <vscale x 4 x i1> %110, %108 + %112 = select <vscale x 4 x i1> %111, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 999, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> %100 + store <vscale x 4 x i32> %112, ptr %scevgep5, align 4 + %vec.ind.next.6 = add <vscale x 4 x i64> %vec.ind, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 28, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) + %lsr.iv.next = add nsw i64 %lsr.iv, -28 + %lsr.iv.next4 = add nuw nsw i64 %lsr.iv3, 28 + %113 = icmp eq i64 %lsr.iv.next, 0 + br i1 %113, label %for.inc20.i, label %vector.body + + for.inc20.i: ; preds = %vector.body + %indvars.iv.next46.i = add nuw nsw i64 %indvars.iv45.i, 1 + %lsr.iv.next2 = add nuw nsw i64 %lsr.iv1, 2800 + %exitcond48.not.i = icmp eq i64 %indvars.iv.next46.i, 2800 + br i1 %exitcond48.not.i, label %init_array.exit, label %vector.ph + + init_array.exit: ; preds = %for.inc20.i + call void @free(ptr noundef nonnull %0) + ret i32 0 + } + + attributes #0 = { mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc" "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="a64fx" "target-features"="+aes,+complxnum,+crc,+fp-armv8,+fullfp16,+lse,+neon,+outline-atomics,+perfmon,+ras,+rdm,+sha2,+sve,+v8.1a,+v8.2a,+v8a,-fmv" } + attributes #1 = { nounwind uwtable vscale_range(1,1) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="a64fx" 
"target-features"="+aes,+complxnum,+crc,+fp-armv8,+fullfp16,+lse,+neon,+outline-atomics,+perfmon,+ras,+rdm,+sha2,+sve,+v8.1a,+v8.2a,+v8a,-fmv" } + +... +--- +name: main +tracksRegLiveness: true +stack: + - { id: 0, name: ret.i.i55, size: 8, alignment: 8, local-offset: -8 } + - { id: 1, name: ret.i.i, size: 8, alignment: 8, local-offset: -16 } +machineFunctionInfo: {} +body: | + bb.0.entry: + %18:gpr64all = COPY $xzr + %17:gpr64all = COPY %18 + %19:gpr64common = LDRXui %stack.1.ret.i.i, 0 :: (dereferenceable load (s64) from %ir.ret.i.i) + %0:gpr64common = COPY %19 + %21:zpr = DUP_ZI_D 0, 0, implicit $vg + %23:gpr32 = MOVi32imm 2800 + %24:ppr_3b = PTRUE_D 31, implicit $vg + %28:ppr_3b = PTRUE_S 31, implicit $vg + %29:gpr32common = MOVi32imm 613566757 + %30:zpr = DUP_ZR_S %29 + %36:zpr = DUP_ZI_S 7, 0, implicit $vg + %43:gpr32common = MOVi32imm -991146299 + %44:zpr = DUP_ZR_S %43 + %46:gpr32common = MOVi32imm 330382099 + %47:zpr = DUP_ZR_S %46 + %49:gpr32common = MOVi32imm -1227133513 + %50:zpr = DUP_ZR_S %49 + %52:gpr32common = MOVi32imm 613566756 + %53:zpr = DUP_ZR_S %52 + %56:gpr32common = MOVi32imm -1171354717 + %57:zpr = DUP_ZR_S %56 + %59:gpr32common = MOVi32imm 390451572 + %60:zpr = DUP_ZR_S %59 + %63:gpr32common = MOVi32imm 999 + %64:zpr = DUP_ZR_S %63 + %79:gpr64common = MOVi64imm 4 + %104:gpr64common = MOVi64imm 8 + %129:gpr64common = MOVi64imm 12 + %154:gpr64common = MOVi64imm 16 + %179:gpr64common = MOVi64imm 20 + %204:gpr64common = MOVi64imm 24 + + bb.1.vector.ph: + %1:gpr64sp = PHI %17, %bb.0, %14, %bb.3 + %2:gpr64sp = PHI %17, %bb.0, %13, %bb.3 + %4:zpr = DUP_ZR_D %2 + %22:zpr = COPY %21 + %20:gpr64all = SUBREG_TO_REG 0, %23, %subreg.sub_32 + + bb.2.vector.body: + successors: %bb.3(0x04000000), %bb.2(0x7c000000) + + %5:gpr64common = PHI %1, %bb.1, %12, %bb.2 + %6:gpr64sp = PHI %20, %bb.1, %11, %bb.2 + %7:zpr = PHI %21, %bb.1, %9, %bb.2 + %8:zpr = PHI %22, %bb.1, %10, %bb.2 + %25:zpr = nuw nsw MUL_ZPZZ_D_UNDEF %24, %7, %4 + %26:zpr = nuw nsw 
MUL_ZPZZ_D_UNDEF %24, %8, %4 + %27:zpr = UZP1_ZZZ_S killed %25, killed %26 + %31:zpr = UMULH_ZPZZ_S_UNDEF %28, %27, %30 + %32:zpr = SUB_ZZZ_S %27, %31 + %33:zpr = LSR_ZZI_S killed %32, 1 + %34:zpr = ADD_ZZZ_S killed %33, %31 + %35:zpr = LSR_ZZI_S killed %34, 2 + %37:zpr = MLS_ZPZZZ_S_UNDEF %28, %27, killed %35, %36 + %38:zpr = nuw nsw ADD_ZI_S %37, 1, 0 + %39:gpr64common = ADDXrs %19, %5, 2 + %40:zpr = nuw nsw ADD_ZZZ_D %7, %4 + %41:zpr = nuw nsw ADD_ZZZ_D %8, %4 + %42:zpr = UZP1_ZZZ_S killed %40, killed %41 + %45:zpr = MUL_ZPZZ_S_UNDEF %28, %42, %44 + %48:ppr = CMPHS_PPzZZ_S %28, %47, killed %45, implicit-def dead $nzcv + %51:zpr = MUL_ZPZZ_S_UNDEF %28, %42, %50 + %54:ppr = CMPHS_PPzZZ_S %28, %53, killed %51, implicit-def dead $nzcv + %55:ppr = SEL_PPPP %48, %48, killed %54 + %58:zpr = MUL_ZPZZ_S_UNDEF %28, %42, %57 + %61:ppr = CMPHS_PPzZZ_S %28, %60, killed %58, implicit-def dead $nzcv + %62:ppr = SEL_PPPP %61, %61, killed %55 + %65:zpr = SEL_ZPZZ_S killed %62, %64, killed %38 + ST1W killed %65, %28, %0, %5 :: (store (<vscale x 1 x s128>) into %ir.scevgep16, align 4) + %67:zpr = ADD_ZI_D %8, 4, 0 + %68:zpr = ADD_ZI_D %7, 4, 0 + %69:zpr = nuw nsw MUL_ZPZZ_D_UNDEF %24, %68, %4 + %70:zpr = nuw nsw MUL_ZPZZ_D_UNDEF %24, %67, %4 + %71:zpr = UZP1_ZZZ_S killed %69, killed %70 + %72:zpr = UMULH_ZPZZ_S_UNDEF %28, %71, %30 + %73:zpr = SUB_ZZZ_S %71, %72 + %74:zpr = LSR_ZZI_S killed %73, 1 + %75:zpr = ADD_ZZZ_S killed %74, %72 + %76:zpr = LSR_ZZI_S killed %75, 2 + %77:zpr = MLS_ZPZZZ_S_UNDEF %28, %71, killed %76, %36 + %78:zpr = nuw nsw ADD_ZI_S %77, 1, 0 + %80:zpr = nuw nsw ADD_ZZZ_D %68, %4 + %81:zpr = nuw nsw ADD_ZZZ_D %67, %4 + %82:zpr = UZP1_ZZZ_S killed %80, killed %81 + %83:zpr = MUL_ZPZZ_S_UNDEF %28, %82, %44 + %84:ppr = CMPHS_PPzZZ_S %28, %47, killed %83, implicit-def dead $nzcv + %85:zpr = MUL_ZPZZ_S_UNDEF %28, %82, %50 + %86:ppr = CMPHS_PPzZZ_S %28, %53, killed %85, implicit-def dead $nzcv + %87:ppr = SEL_PPPP %84, %84, killed %86 + %88:zpr = MUL_ZPZZ_S_UNDEF 
%28, %82, %57 + %89:ppr = CMPHS_PPzZZ_S %28, %60, killed %88, implicit-def dead $nzcv + %90:ppr = SEL_PPPP %89, %89, killed %87 + %91:zpr = SEL_ZPZZ_S killed %90, %64, killed %78 + ST1W killed %91, %28, %39, %79 :: (store (<vscale x 1 x s128>) into %ir.scevgep15, align 4) + %92:zpr = ADD_ZI_D %8, 8, 0 + %93:zpr = ADD_ZI_D %7, 8, 0 + %94:zpr = nuw nsw MUL_ZPZZ_D_UNDEF %24, %93, %4 + %95:zpr = nuw nsw MUL_ZPZZ_D_UNDEF %24, %92, %4 + %96:zpr = UZP1_ZZZ_S killed %94, killed %95 + %97:zpr = UMULH_ZPZZ_S_UNDEF %28, %96, %30 + %98:zpr = SUB_ZZZ_S %96, %97 + %99:zpr = LSR_ZZI_S killed %98, 1 + %100:zpr = ADD_ZZZ_S killed %99, %97 + %101:zpr = LSR_ZZI_S killed %100, 2 + %102:zpr = MLS_ZPZZZ_S_UNDEF %28, %96, killed %101, %36 + %103:zpr = nuw nsw ADD_ZI_S %102, 1, 0 + %105:zpr = nuw nsw ADD_ZZZ_D %93, %4 + %106:zpr = nuw nsw ADD_ZZZ_D %92, %4 + %107:zpr = UZP1_ZZZ_S killed %105, killed %106 + %108:zpr = MUL_ZPZZ_S_UNDEF %28, %107, %44 + %109:ppr = CMPHS_PPzZZ_S %28, %47, killed %108, implicit-def dead $nzcv + %110:zpr = MUL_ZPZZ_S_UNDEF %28, %107, %50 + %111:ppr = CMPHS_PPzZZ_S %28, %53, killed %110, implicit-def dead $nzcv + %112:ppr = SEL_PPPP %109, %109, killed %111 + %113:zpr = MUL_ZPZZ_S_UNDEF %28, %107, %57 + %114:ppr = CMPHS_PPzZZ_S %28, %60, killed %113, implicit-def dead $nzcv + %115:ppr = SEL_PPPP %114, %114, killed %112 + %116:zpr = SEL_ZPZZ_S killed %115, %64, killed %103 + ST1W killed %116, %28, %39, %104 :: (store (<vscale x 1 x s128>) into %ir.scevgep13, align 4) + %117:zpr = ADD_ZI_D %8, 12, 0 + %118:zpr = ADD_ZI_D %7, 12, 0 + %119:zpr = nuw nsw MUL_ZPZZ_D_UNDEF %24, %118, %4 + %120:zpr = nuw nsw MUL_ZPZZ_D_UNDEF %24, %117, %4 + %121:zpr = UZP1_ZZZ_S killed %119, killed %120 + %122:zpr = UMULH_ZPZZ_S_UNDEF %28, %121, %30 + %123:zpr = SUB_ZZZ_S %121, %122 + %124:zpr = LSR_ZZI_S killed %123, 1 + %125:zpr = ADD_ZZZ_S killed %124, %122 + %126:zpr = LSR_ZZI_S killed %125, 2 + %127:zpr = MLS_ZPZZZ_S_UNDEF %28, %121, killed %126, %36 + %128:zpr = nuw nsw ADD_ZI_S 
%127, 1, 0 + %130:zpr = nuw nsw ADD_ZZZ_D %118, %4 + %131:zpr = nuw nsw ADD_ZZZ_D %117, %4 + %132:zpr = UZP1_ZZZ_S killed %130, killed %131 + %133:zpr = MUL_ZPZZ_S_UNDEF %28, %132, %44 + %134:ppr = CMPHS_PPzZZ_S %28, %47, killed %133, implicit-def dead $nzcv + %135:zpr = MUL_ZPZZ_S_UNDEF %28, %132, %50 + %136:ppr = CMPHS_PPzZZ_S %28, %53, killed %135, implicit-def dead $nzcv + %137:ppr = SEL_PPPP %134, %134, killed %136 + %138:zpr = MUL_ZPZZ_S_UNDEF %28, %132, %57 + %139:ppr = CMPHS_PPzZZ_S %28, %60, killed %138, implicit-def dead $nzcv + %140:ppr = SEL_PPPP %139, %139, killed %137 + %141:zpr = SEL_ZPZZ_S killed %140, %64, killed %128 + ST1W killed %141, %28, %39, %129 :: (store (<vscale x 1 x s128>) into %ir.scevgep11, align 4) + %142:zpr = ADD_ZI_D %8, 16, 0 + %143:zpr = ADD_ZI_D %7, 16, 0 + %144:zpr = nuw nsw MUL_ZPZZ_D_UNDEF %24, %143, %4 + %145:zpr = nuw nsw MUL_ZPZZ_D_UNDEF %24, %142, %4 + %146:zpr = UZP1_ZZZ_S killed %144, killed %145 + %147:zpr = UMULH_ZPZZ_S_UNDEF %28, %146, %30 + %148:zpr = SUB_ZZZ_S %146, %147 + %149:zpr = LSR_ZZI_S killed %148, 1 + %150:zpr = ADD_ZZZ_S killed %149, %147 + %151:zpr = LSR_ZZI_S killed %150, 2 + %152:zpr = MLS_ZPZZZ_S_UNDEF %28, %146, killed %151, %36 + %153:zpr = nuw nsw ADD_ZI_S %152, 1, 0 + %155:zpr = nuw nsw ADD_ZZZ_D %143, %4 + %156:zpr = nuw nsw ADD_ZZZ_D %142, %4 + %157:zpr = UZP1_ZZZ_S killed %155, killed %156 + %158:zpr = MUL_ZPZZ_S_UNDEF %28, %157, %44 + %159:ppr = CMPHS_PPzZZ_S %28, %47, killed %158, implicit-def dead $nzcv + %160:zpr = MUL_ZPZZ_S_UNDEF %28, %157, %50 + %161:ppr = CMPHS_PPzZZ_S %28, %53, killed %160, implicit-def dead $nzcv + %162:ppr = SEL_PPPP %159, %159, killed %161 + %163:zpr = MUL_ZPZZ_S_UNDEF %28, %157, %57 + %164:ppr = CMPHS_PPzZZ_S %28, %60, killed %163, implicit-def dead $nzcv + %165:ppr = SEL_PPPP %164, %164, killed %162 + %166:zpr = SEL_ZPZZ_S killed %165, %64, killed %153 + ST1W killed %166, %28, %39, %154 :: (store (<vscale x 1 x s128>) into %ir.scevgep9, align 4) + %167:zpr = 
ADD_ZI_D %8, 20, 0 + %168:zpr = ADD_ZI_D %7, 20, 0 + %169:zpr = nuw nsw MUL_ZPZZ_D_UNDEF %24, %168, %4 + %170:zpr = nuw nsw MUL_ZPZZ_D_UNDEF %24, %167, %4 + %171:zpr = UZP1_ZZZ_S killed %169, killed %170 + %172:zpr = UMULH_ZPZZ_S_UNDEF %28, %171, %30 + %173:zpr = SUB_ZZZ_S %171, %172 + %174:zpr = LSR_ZZI_S killed %173, 1 + %175:zpr = ADD_ZZZ_S killed %174, %172 + %176:zpr = LSR_ZZI_S killed %175, 2 + %177:zpr = MLS_ZPZZZ_S_UNDEF %28, %171, killed %176, %36 + %178:zpr = nuw nsw ADD_ZI_S %177, 1, 0 + %180:zpr = nuw nsw ADD_ZZZ_D %168, %4 + %181:zpr = nuw nsw ADD_ZZZ_D %167, %4 + %182:zpr = UZP1_ZZZ_S killed %180, killed %181 + %183:zpr = MUL_ZPZZ_S_UNDEF %28, %182, %44 + %184:ppr = CMPHS_PPzZZ_S %28, %47, killed %183, implicit-def dead $nzcv + %185:zpr = MUL_ZPZZ_S_UNDEF %28, %182, %50 + %186:ppr = CMPHS_PPzZZ_S %28, %53, killed %185, implicit-def dead $nzcv + %187:ppr = SEL_PPPP %184, %184, killed %186 + %188:zpr = MUL_ZPZZ_S_UNDEF %28, %182, %57 + %189:ppr = CMPHS_PPzZZ_S %28, %60, killed %188, implicit-def dead $nzcv + %190:ppr = SEL_PPPP %189, %189, killed %187 + %191:zpr = SEL_ZPZZ_S killed %190, %64, killed %178 + ST1W killed %191, %28, %39, %179 :: (store (<vscale x 1 x s128>) into %ir.scevgep7, align 4) + %192:zpr = ADD_ZI_D %8, 24, 0 + %193:zpr = ADD_ZI_D %7, 24, 0 + %194:zpr = nuw nsw MUL_ZPZZ_D_UNDEF %24, %193, %4 + %195:zpr = nuw nsw MUL_ZPZZ_D_UNDEF %24, %192, %4 + %196:zpr = UZP1_ZZZ_S killed %194, killed %195 + %197:zpr = UMULH_ZPZZ_S_UNDEF %28, %196, %30 + %198:zpr = SUB_ZZZ_S %196, %197 + %199:zpr = LSR_ZZI_S killed %198, 1 + %200:zpr = ADD_ZZZ_S killed %199, %197 + %201:zpr = LSR_ZZI_S killed %200, 2 + %202:zpr = MLS_ZPZZZ_S_UNDEF %28, %196, killed %201, %36 + %203:zpr = nuw nsw ADD_ZI_S %202, 1, 0 + %205:zpr = nuw nsw ADD_ZZZ_D %193, %4 + %206:zpr = nuw nsw ADD_ZZZ_D %192, %4 + %207:zpr = UZP1_ZZZ_S killed %205, killed %206 + %208:zpr = MUL_ZPZZ_S_UNDEF %28, %207, %44 + %209:ppr = CMPHS_PPzZZ_S %28, %47, killed %208, implicit-def dead $nzcv + 
%210:zpr = MUL_ZPZZ_S_UNDEF %28, %207, %50 + %211:ppr = CMPHS_PPzZZ_S %28, %53, killed %210, implicit-def dead $nzcv + %212:ppr = SEL_PPPP %209, %209, killed %211 + %213:zpr = MUL_ZPZZ_S_UNDEF %28, %207, %57 + %214:ppr = CMPHS_PPzZZ_S %28, %60, killed %213, implicit-def dead $nzcv + %215:ppr = SEL_PPPP %214, %214, killed %212 + %216:zpr = SEL_ZPZZ_S killed %215, %64, killed %203 + ST1W killed %216, %28, %39, %204 :: (store (<vscale x 1 x s128>) into %ir.scevgep5, align 4) + %9:zpr = ADD_ZI_D %7, 28, 0 + %10:zpr = ADD_ZI_D %8, 28, 0 + %217:gpr64 = nsw SUBSXri %6, 28, 0, implicit-def $nzcv + %11:gpr64all = COPY %217 + %218:gpr64sp = nuw nsw ADDXri %5, 28, 0 + %12:gpr64all = COPY %218 + Bcc 1, %bb.2, implicit $nzcv + B %bb.3 + + bb.3.for.inc20.i: + successors: %bb.4(0x04000000), %bb.1(0x7c000000) + + %219:gpr64sp = nuw nsw ADDXri %2, 1, 0 + %13:gpr64all = COPY %219 + %220:gpr64sp = nuw nsw ADDXri %1, 2800, 0 + %14:gpr64all = COPY %220 + dead $xzr = SUBSXri %219, 2800, 0, implicit-def $nzcv + Bcc 1, %bb.1, implicit $nzcv + B %bb.4 + + bb.4.init_array.exit: + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + $x0 = COPY %0 + BL @free, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + %222:gpr32all = COPY $wzr + $w0 = COPY %222 + RET_ReallyLR implicit $w0 + +... 
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll index c0c0ae5c9d1f..03d40fc61f0c 100644 --- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll @@ -206,7 +206,7 @@ define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind { ; CHECK-NEXT: movi v1.4s, #128, lsl #24 ; CHECK-NEXT: usra v3.4s, v2.4s, #1 ; CHECK-NEXT: and v1.16b, v3.16b, v1.16b -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll b/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll new file mode 100644 index 000000000000..0b6bf3892a0c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll @@ -0,0 +1,152 @@ +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -pass-remarks-analysis=sme -aarch64-stack-hazard-remark-size=64 -o /dev/null < %s 2>&1 | FileCheck %s --check-prefixes=CHECK +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -pass-remarks-analysis=sme -aarch64-stack-hazard-size=1024 -o /dev/null < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-PADDING + +; Don't emit remarks for non-streaming functions. +define float @csr_x20_stackargs_notsc(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i) { +; CHECK-NOT: remark: <unknown>:0:0: stack hazard in 'csr_x20_stackargs_notsc': +; CHECK-PADDING-NOT: remark: <unknown>:0:0: stack hazard in 'csr_x20_stackargs_notsc': +entry: + tail call void asm sideeffect "", "~{x20}"() #1 + ret float %i +} + +; Don't emit remarks for functions that only access GPR stack objects. 
+define i64 @stackargs_gpr(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h, i64 %i) #2 { +; CHECK-NOT: remark: <unknown>:0:0: stack hazard in 'stackargs_gpr': +; CHECK-PADDING-NOT: remark: <unknown>:0:0: stack hazard in 'stackargs_gpr': +entry: + ret i64 %i +} + +; Don't emit remarks for functions that only access FPR stack objects. +define double @stackargs_fpr(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i) #2 { +; CHECK-NOT: remark: <unknown>:0:0: stack hazard in 'stackargs_fpr': +; CHECK-PADDING-NOT: remark: <unknown>:0:0: stack hazard in 'stackargs_fpr': +entry: + ret double %i +} + +; As this case is handled by addition of stack hazard padding, only emit remarks when this is not switched on. +define i32 @csr_d8_alloci64(i64 %d) #2 { +; CHECK: remark: <unknown>:0:0: stack hazard in 'csr_d8_alloci64': FPR stack object at [SP-16] is too close to GPR stack object at [SP-8] +; CHECK-PADDING-NOT: remark: <unknown>:0:0: stack hazard in 'csr_d8_alloci64': +entry: + %a = alloca i64 + tail call void asm sideeffect "", "~{d8}"() #1 + store i64 %d, ptr %a + ret i32 0 +} + +; As this case is handled by addition of stack hazard padding, only emit remarks when this is not switched on. 
+define i32 @csr_d8_allocnxv4i32(i64 %d) #2 { +; CHECK: remark: <unknown>:0:0: stack hazard in 'csr_d8_allocnxv4i32': FPR stack object at [SP-16] is too close to GPR stack object at [SP-8] +; CHECK-PADDING-NOT: remark: <unknown>:0:0: stack hazard in 'csr_d8_allocnxv4i32': +entry: + %a = alloca <vscale x 4 x i32> + tail call void asm sideeffect "", "~{d8}"() #1 + store <vscale x 4 x i32> zeroinitializer, ptr %a + ret i32 0 +} + +define float @csr_x20_stackargs(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i) #2 { +; CHECK: remark: <unknown>:0:0: stack hazard in 'csr_x20_stackargs': GPR stack object at [SP-16] is too close to FPR stack object at [SP+0] +; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'csr_x20_stackargs': GPR stack object at [SP-16] is too close to FPR stack object at [SP+0] +entry: + tail call void asm sideeffect "", "~{x20}"() #1 + ret float %i +} + +; In this case, addition of stack hazard padding triggers x29 (fp) spill, so a hazard occurs between FPR argument and GPR spill. +define float @csr_d8_stackargs(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i) #2 { +; CHECK-NOT: remark: <unknown>:0:0: stack hazard in 'csr_d8_stackargs': +; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'csr_d8_stackargs': GPR stack object at [SP-8] is too close to FPR stack object at [SP+0] +entry: + tail call void asm sideeffect "", "~{d8}"() #1 + ret float %i +} + +; SVE calling conventions +; Predicate register spills end up in FP region, currently. 
+ +define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) #2 { +; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at [SP-48-258 * vscale] is too close to FPR stack object at [SP-48-256 * vscale] +; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_call': FPR stack object at [SP-48-16 * vscale] is too close to GPR stack object at [SP-48] +; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at [SP-1072-258 * vscale] is too close to FPR stack object at [SP-1072-256 * vscale] +; CHECK-PADDING-NOT: remark: <unknown>:0:0: stack hazard in 'svecc_call': +entry: + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37) + ret i32 -396142473 +} + +define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) #2 { +; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': PPR stack object at [SP-48-258 * vscale] is too close to FPR stack object at [SP-48-256 * vscale] +; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': FPR stack object at [SP-48-16 * vscale] is too close to GPR stack object at [SP-48] +; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': PPR stack object at [SP-1072-258 * vscale] is too close to FPR stack object at [SP-1072-256 * vscale] +; CHECK-PADDING-NOT: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': +entry: + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + %0 = alloca [37 x i8], align 16 + %call = call ptr @memset(ptr noundef nonnull %0, i32 noundef 45, i32 noundef 37) + ret i32 -396142473 +} +declare ptr @memset(ptr, i32, i32) + +%struct.mixed_struct = type { i32, float } + +define i32 @mixed_stack_object(i32 %a, float %b) #2 { +; CHECK: remark: <unknown>:0:0: stack hazard in 'mixed_stack_object': Mixed stack object at [SP-8] accessed by both 
GP and FP instructions +; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'mixed_stack_object': Mixed stack object at [SP-8] accessed by both GP and FP instructions +entry: + %s = alloca %struct.mixed_struct + %s.i = getelementptr %struct.mixed_struct, ptr %s, i32 0, i32 0 + %s.f = getelementptr %struct.mixed_struct, ptr %s, i32 0, i32 1 + store i32 %a, ptr %s.i + store float %b, ptr %s.f + ret i32 %a +} + +define i32 @mixed_stack_objects(i32 %a, float %b) #2 { +; CHECK: remark: <unknown>:0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP-16] is too close to Mixed stack object at [SP-8] +; CHECK: remark: <unknown>:0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP-16] accessed by both GP and FP instructions +; CHECK: remark: <unknown>:0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP-8] accessed by both GP and FP instructions +; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP-16] is too close to Mixed stack object at [SP-8] +; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP-16] accessed by both GP and FP instructions +; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP-8] accessed by both GP and FP instructions +entry: + %s0 = alloca %struct.mixed_struct + %s0.i = getelementptr %struct.mixed_struct, ptr %s0, i32 0, i32 0 + %s0.f = getelementptr %struct.mixed_struct, ptr %s0, i32 0, i32 1 + store i32 %a, ptr %s0.i + store float %b, ptr %s0.f + + %s1 = alloca %struct.mixed_struct + %s1.i = getelementptr %struct.mixed_struct, ptr %s1, i32 0, i32 0 + %s1.f = getelementptr %struct.mixed_struct, ptr %s1, i32 0, i32 1 + store i32 %a, ptr %s1.i + store float %b, ptr %s1.f + + ret i32 %a +} + +; VLA-area stack objects are not separated. 
+define i32 @csr_d8_allocnxv4i32i32f64_vlai32f64(double %d, i32 %i) #2 { +; CHECK: remark: <unknown>:0:0: stack hazard in 'csr_d8_allocnxv4i32i32f64_vlai32f64': GPR stack object at [SP-48-16 * vscale] is too close to FPR stack object at [SP-48-16 * vscale] +; CHECK: remark: <unknown>:0:0: stack hazard in 'csr_d8_allocnxv4i32i32f64_vlai32f64': FPR stack object at [SP-32] is too close to GPR stack object at [SP-24] +; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'csr_d8_allocnxv4i32i32f64_vlai32f64': GPR stack object at [SP-2096-16 * vscale] is too close to FPR stack object at [SP-2096-16 * vscale] +; CHECK-PADDING-NOT: remark: <unknown>:0:0: stack hazard in 'csr_d8_allocnxv4i32i32f64_vlai32f64': +entry: + %a = alloca <vscale x 4 x i32> + %0 = zext i32 %i to i64 + %vla0 = alloca i32, i64 %0 + %vla1 = alloca double, i64 %0 + %c = alloca double + tail call void asm sideeffect "", "~{d8}"() #1 + store <vscale x 4 x i32> zeroinitializer, ptr %a + store i32 zeroinitializer, ptr %vla0 + store double %d, ptr %vla1 + store double %d, ptr %c + ret i32 0 +} + +attributes #2 = { "aarch64_pstate_sm_compatible" } diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll index 2541910e080e..adbdee0eb084 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll @@ -1505,9 +1505,9 @@ define <vscale x 2 x i64> @sub_nxv2i64_y(<vscale x 2 x i64> %x, <vscale x 2 x i6 ; CHECK-LABEL: sub_nxv2i64_y: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: cmpgt p0.d, p0/z, z2.d, #0 -; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: subr z1.d, p0/m, z1.d, z0.d +; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: %c = icmp sgt <vscale x 2 x i64> %n, zeroinitializer @@ -1520,9 +1520,9 @@ define <vscale x 4 x i32> @sub_nxv4i32_y(<vscale x 4 x i32> %x, <vscale x 4 x i3 ; CHECK-LABEL: sub_nxv4i32_y: ; CHECK: // %bb.0: // 
%entry ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: cmpgt p0.s, p0/z, z2.s, #0 -; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: subr z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: %c = icmp sgt <vscale x 4 x i32> %n, zeroinitializer @@ -1535,9 +1535,9 @@ define <vscale x 8 x i16> @sub_nxv8i16_y(<vscale x 8 x i16> %x, <vscale x 8 x i1 ; CHECK-LABEL: sub_nxv8i16_y: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: cmpgt p0.h, p0/z, z2.h, #0 -; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: subr z1.h, p0/m, z1.h, z0.h +; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: %c = icmp sgt <vscale x 8 x i16> %n, zeroinitializer @@ -1550,9 +1550,9 @@ define <vscale x 16 x i8> @sub_nxv16i8_y(<vscale x 16 x i8> %x, <vscale x 16 x i ; CHECK-LABEL: sub_nxv16i8_y: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: cmpgt p0.b, p0/z, z2.b, #0 -; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b +; CHECK-NEXT: subr z1.b, p0/m, z1.b, z0.b +; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: %c = icmp sgt <vscale x 16 x i8> %n, zeroinitializer @@ -2517,10 +2517,10 @@ define <vscale x 4 x float> @fsub_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4 ; CHECK-LABEL: fsub_nxv4f32_y: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: fsub z0.s, z0.s, z1.s ; CHECK-NEXT: fcmle p1.s, p0/z, z2.s, #0.0 ; CHECK-NEXT: not p0.b, p0/z, p1.b -; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: fsubr z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer @@ -2533,10 +2533,10 @@ define <vscale x 8 x half> @fsub_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x ; CHECK-LABEL: fsub_nxv8f16_y: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: fsub z0.h, z0.h, z1.h ; CHECK-NEXT: fcmle p1.h, p0/z, z2.h, #0.0 ; CHECK-NEXT: not 
p0.b, p0/z, p1.b -; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: fsubr z1.h, p0/m, z1.h, z0.h +; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer @@ -2549,10 +2549,10 @@ define <vscale x 2 x double> @fsub_nxv2f64_y(<vscale x 2 x double> %x, <vscale x ; CHECK-LABEL: fsub_nxv2f64_y: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fsub z0.d, z0.d, z1.d ; CHECK-NEXT: fcmle p1.d, p0/z, z2.d, #0.0 ; CHECK-NEXT: not p0.b, p0/z, p1.b -; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: fsubr z1.d, p0/m, z1.d, z0.d +; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll index bafd5abcc7b2..6607f9c3b368 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll @@ -932,9 +932,9 @@ define <vscale x 2 x i64> @sub_nxv2i64_y(<vscale x 2 x i64> %x, <vscale x 2 x i6 ; CHECK-LABEL: sub_nxv2i64_y: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: cmpgt p0.d, p0/z, z2.d, #0 -; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: subr z1.d, p0/m, z1.d, z0.d +; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: %c = icmp sgt <vscale x 2 x i64> %n, zeroinitializer @@ -947,9 +947,9 @@ define <vscale x 4 x i32> @sub_nxv4i32_y(<vscale x 4 x i32> %x, <vscale x 4 x i3 ; CHECK-LABEL: sub_nxv4i32_y: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: cmpgt p0.s, p0/z, z2.s, #0 -; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: subr z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: %c = icmp sgt <vscale x 4 x i32> %n, zeroinitializer @@ -962,9 +962,9 @@ define <vscale x 8 x i16> @sub_nxv8i16_y(<vscale x 8 x i16> %x, <vscale x 8 x i1 ; CHECK-LABEL: 
sub_nxv8i16_y: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: cmpgt p0.h, p0/z, z2.h, #0 -; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: subr z1.h, p0/m, z1.h, z0.h +; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: %c = icmp sgt <vscale x 8 x i16> %n, zeroinitializer @@ -977,9 +977,9 @@ define <vscale x 16 x i8> @sub_nxv16i8_y(<vscale x 16 x i8> %x, <vscale x 16 x i ; CHECK-LABEL: sub_nxv16i8_y: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: cmpgt p0.b, p0/z, z2.b, #0 -; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b +; CHECK-NEXT: subr z1.b, p0/m, z1.b, z0.b +; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: %c = icmp sgt <vscale x 16 x i8> %n, zeroinitializer @@ -1588,10 +1588,10 @@ define <vscale x 4 x float> @fsub_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4 ; CHECK-LABEL: fsub_nxv4f32_y: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: fsub z0.s, z0.s, z1.s ; CHECK-NEXT: fcmle p1.s, p0/z, z2.s, #0.0 ; CHECK-NEXT: not p0.b, p0/z, p1.b -; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: fsubr z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer @@ -1604,10 +1604,10 @@ define <vscale x 8 x half> @fsub_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x ; CHECK-LABEL: fsub_nxv8f16_y: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: fsub z0.h, z0.h, z1.h ; CHECK-NEXT: fcmle p1.h, p0/z, z2.h, #0.0 ; CHECK-NEXT: not p0.b, p0/z, p1.b -; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: fsubr z1.h, p0/m, z1.h, z0.h +; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer @@ -1620,10 +1620,10 @@ define <vscale x 2 x double> @fsub_nxv2f64_y(<vscale x 2 x double> %x, <vscale x ; CHECK-LABEL: fsub_nxv2f64_y: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fsub 
z0.d, z0.d, z1.d ; CHECK-NEXT: fcmle p1.d, p0/z, z2.d, #0.0 ; CHECK-NEXT: not p0.b, p0/z, p1.b -; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: fsubr z1.d, p0/m, z1.d, z0.d +; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll index 431c9dc76508..ec94198a08ca 100644 --- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll +++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll @@ -150,8 +150,8 @@ entry: ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Variable, Align: 16, Size: vscale x 16 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40-16 x vscale], Type: Variable, Align: 8, Size: 8 -; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: VariableSized, Align: 1, Size: 0 -; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: VariableSized, Align: 1, Size: 0 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-16 x vscale], Type: VariableSized, Align: 1, Size: 0 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-16 x vscale], Type: VariableSized, Align: 1, Size: 0 define i32 @csr_d8_allocnxv4i32i32f64_vla(double %d, i32 %i) "aarch64_pstate_sm_compatible" { ; CHECK-LABEL: csr_d8_allocnxv4i32i32f64_vla: diff --git a/llvm/test/CodeGen/AArch64/sve2-min-max-clamp.ll b/llvm/test/CodeGen/AArch64/sve2-min-max-clamp.ll index 365fd5345484..d5fda04b9773 100644 --- a/llvm/test/CodeGen/AArch64/sve2-min-max-clamp.ll +++ b/llvm/test/CodeGen/AArch64/sve2-min-max-clamp.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sve-b16b16 < %s | FileCheck %s ; Replace pattern min(max(v1,v2),v3) by clamp diff --git 
a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll index 221bb3b6045f..7b921d71cbfb 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sve-b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ ; RUN: | FileCheck %s define <vscale x 8 x bfloat> @bfadd_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){ diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll index 7934f831a7e6..baadd08e392d 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sme2 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sme2 -mattr=+sve-b16b16 -verify-machineinstrs < %s | FileCheck %s define <vscale x 8 x bfloat> @bfclamp(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){ ; CHECK-LABEL: bfclamp: diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll index 24c4fedb3426..55ef452b6030 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 
-mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sve-b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ ; RUN: | FileCheck %s define <vscale x 8 x bfloat> @bfmax_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){ diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll index 25fe9cf7243a..9b0f7e039f2e 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sve-b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ ; RUN: | FileCheck %s define <vscale x 8 x bfloat> @bfmaxnm_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){ diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll index d5b0b8be8b85..8c586fd47f5a 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sve-b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ ; RUN: | FileCheck %s define <vscale x 8 x bfloat> @bfmin_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){ diff --git 
a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll index c019dc7cbe29..90132224e022 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sve-b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ ; RUN: | FileCheck %s define <vscale x 8 x bfloat> @bfminnm_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){ diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll index 02b1db13ea34..eb7e99f332da 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sve-b16b16 -verify-machineinstrs < %s | FileCheck %s define <vscale x 8 x bfloat> @bfmla_m(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){ ; CHECK-LABEL: bfmla_m: diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla_lane.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla_lane.ll index d0e3a82df3ff..ece96b38d786 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla_lane.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla_lane.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu 
-mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sve-b16b16 -verify-machineinstrs < %s | FileCheck %s define <vscale x 8 x bfloat> @bfmla_lane_idx1(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){ ; CHECK-LABEL: bfmla_lane_idx1: diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll index 987fe1fb5822..8ff1afcc9b4a 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sve-b16b16 -verify-machineinstrs < %s | FileCheck %s define <vscale x 8 x bfloat> @bfmls_m(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){ ; CHECK-LABEL: bfmls_m: diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls_lane.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls_lane.ll index 16b4538ffab9..81406bf08b93 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls_lane.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls_lane.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sve-b16b16 -verify-machineinstrs < %s | FileCheck %s define <vscale x 8 x bfloat> @bfmls_lane_idx1(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){ ; CHECK-LABEL: bfmls_lane_idx1: diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll 
b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll index a04c5a52139c..8b6a087578ed 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sve-b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ ; RUN: | FileCheck %s define <vscale x 8 x bfloat> @bfmul_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){ diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul_lane.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul_lane.ll index 2962d59e707c..28ae9b0d19e1 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul_lane.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul_lane.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sve-b16b16 -verify-machineinstrs < %s | FileCheck %s define <vscale x 8 x bfloat> @bfmul_lane_idx1(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) { ; CHECK-LABEL: bfmul_lane_idx1: diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll index 752b5ae9df63..1b1304312ceb 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ +; RUN: 
llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sve-b16b16 -mattr=+use-experimental-zeroing-pseudos -verify-machineinstrs < %s \ ; RUN: | FileCheck %s define <vscale x 8 x bfloat> @bfsub_pred(<vscale x 8 x i1> %pg, <vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b){ diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll index b9846a6a555d..b2b433167fe4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll @@ -441,6 +441,43 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul_rhs(<4 x float> % ret <4 x float> %d } +define amdgpu_ps float @test_matching_source_from_unmerge(ptr addrspace(3) %aptr, float %b) { +; GFX9-DENORM-LABEL: test_matching_source_from_unmerge: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: ds_read_b64 v[2:3], v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX9-DENORM-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: test_matching_source_from_unmerge: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: ds_read_b64 v[2:3], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-NEXT: ; return to shader part epilog +; +; GFX10-CONTRACT-LABEL: test_matching_source_from_unmerge: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: ds_read_b64 v[2:3], v0 +; GFX10-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-CONTRACT-NEXT: ; return to shader part epilog +; +; GFX10-DENORM-LABEL: test_matching_source_from_unmerge: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: ds_read_b64 v[2:3], v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] 
op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: ; return to shader part epilog +.entry: + %a = load <4 x half>, ptr addrspace(3) %aptr, align 16 + %a_f32 = fpext <4 x half> %a to <4 x float> + %.a3_f32 = extractelement <4 x float> %a_f32, i64 3 + %.a1_f32 = extractelement <4 x float> %a_f32, i64 1 + %res = call float @llvm.fmuladd.f32(float %.a1_f32, float %.a3_f32, float %b) + ret float %res +} + declare float @llvm.fmuladd.f32(float, float, float) #0 declare half @llvm.fmuladd.f16(half, half, half) #0 declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll index 9b9249b62b0b..66b88236bbb4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll @@ -7,10 +7,11 @@ target triple = "amdgcn-amd-amdhsa" ; Make sure flat_scratch_init is set ; GCN-LABEL: {{^}}stack_object_addrspacecast_in_kernel_no_calls: -; RW-FLAT: s_add_u32 flat_scratch_lo, s4, s7 -; RW-FLAT: s_addc_u32 flat_scratch_hi, s5, 0 +; RW-FLAT: s_add_u32 s0, s0, s7 +; RW-FLAT: s_addc_u32 s1, s1, 0 ; RO-FLAT-NOT: flat_scratch -; GCN: flat_store_dword +; RW-FLAT: buffer_store_dword +; RO-FLAT: scratch_store_dword ; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer ; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 1 ; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-copy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-copy.mir index 48de4838b78f..30c374ddee57 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-copy.mir @@ -1,6 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=amdgpu-regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck %s # 
RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=amdgpu-regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck --check-prefix=WAVE32 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck --check-prefix=WAVE32 %s --- name: copy_s32_vgpr_to_vgpr @@ -201,3 +203,299 @@ body: | %2:vcc(s1) = COPY %1 S_ENDPGM 0, implicit %2 ... + +--- +name: wave64_copy_sgpr_64_to_s1 +legalized: true + +body: | + bb.0: + liveins: $sgpr4_sgpr5 + ; CHECK-LABEL: name: wave64_copy_sgpr_64_to_s1 + ; CHECK: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[CONST1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[CONST2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[CONST1]], [[CONST2]] + %0:_(s1) = COPY $sgpr4_sgpr5 + %1:_(s32) = G_ZEXT %0:_(s1) +... + +--- +name: wave32_copy_sgpr_32_to_s1 +legalized: true + +body: | + bb.0: + liveins: $sgpr0 + ; WAVE32-LABEL: name: wave32_copy_sgpr_32_to_s1 + ; WAVE32: liveins: $sgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr0 + ; WAVE32-NEXT: [[CONST1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE32-NEXT: [[CONST2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; WAVE32-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[CONST1]], [[CONST2]] + %0:_(s1) = COPY $sgpr0 + %1:_(s32) = G_ZEXT %0:_(s1) +... 
+ +--- +name: wave64_copy2_sgpr_64_to_s1 +legalized: true + +body: | + bb.0: + liveins: $sgpr4_sgpr5, $sgpr6_sgpr7 + ; CHECK-LABEL: name: wave64_copy2_sgpr_64_to_s1 + ; CHECK: liveins: $sgpr4_sgpr5, $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vcc(s1) = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[CONST1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[CONST2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY1]](s1), [[CONST1]], [[CONST2]] + ; CHECK-NEXT: [[CONST3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[CONST4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY2]](s1), [[CONST3]], [[CONST4]] + %0:_(s1) = COPY $sgpr4_sgpr5 + %1:_(s1) = COPY $sgpr6_sgpr7 + %2:_(s32) = G_ZEXT %0:_(s1) + %3:_(s32) = G_ZEXT %1:_(s1) +... + +--- +name: wave32_copy2_sgpr_32_to_s1 +legalized: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + ; WAVE32-LABEL: name: wave32_copy2_sgpr_32_to_s1 + ; WAVE32: liveins: $sgpr0, $sgpr1 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY $sgpr0 + ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vcc(s1) = COPY $sgpr1 + ; WAVE32-NEXT: [[CONST1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE32-NEXT: [[CONST2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; WAVE32-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY1]](s1), [[CONST1]], [[CONST2]] + ; WAVE32-NEXT: [[CONST3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE32-NEXT: [[CONST4:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; WAVE32-NEXT: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY2]](s1), [[CONST3]], [[CONST4]] + %0:_(s1) = COPY $sgpr0 + %1:_(s1) = COPY $sgpr1 + %2:_(s32) = G_ZEXT %0:_(s1) + %3:_(s32) = G_ZEXT %1:_(s1) +... 
+ +--- +name: copy_sgpr_64_to_s1_vgpr +legalized: true + +body: | + bb.0: + liveins: $sgpr4_sgpr5 + ; CHECK-LABEL: name: copy_sgpr_64_to_s1_vgpr + ; CHECK: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s1) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s32) = G_ZEXT [[COPY]](s1) + ; + ; WAVE32-LABEL: name: copy_sgpr_64_to_s1_vgpr + ; WAVE32: liveins: $sgpr4_sgpr5 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s1) = COPY $sgpr4_sgpr5 + ; WAVE32-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s32) = G_ZEXT [[COPY]](s1) + %0:vgpr(s1) = COPY $sgpr4_sgpr5 + %1:_(s32) = G_ZEXT %0:vgpr(s1) +... + +--- +name: copy_sgpr_32_to_s1_vgpr +legalized: true + +body: | + bb.0: + liveins: $sgpr0 + ; CHECK-LABEL: name: copy_sgpr_32_to_s1_vgpr + ; CHECK: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s1) = COPY $sgpr0 + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s32) = G_ZEXT [[COPY]](s1) + ; + ; WAVE32-LABEL: name: copy_sgpr_32_to_s1_vgpr + ; WAVE32: liveins: $sgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s1) = COPY $sgpr0 + ; WAVE32-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s32) = G_ZEXT [[COPY]](s1) + %0:vgpr(s1) = COPY $sgpr0 + %1:_(s32) = G_ZEXT %0:vgpr(s1) +... + +--- +name: wave64_copy_sgpr_64_to_s1_vcc +legalized: true + +body: | + bb.0: + liveins: $sgpr4_sgpr5 + ; CHECK-LABEL: name: wave64_copy_sgpr_64_to_s1_vcc + ; CHECK: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[CONST1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[CONST2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[CONST1]], [[CONST2]] + %0:vcc(s1) = COPY $sgpr4_sgpr5 + %1:_(s32) = G_ZEXT %0:vcc(s1) +... 
+ +--- +name: wave32_copy_sgpr_32_to_s1_vcc +legalized: true + +body: | + bb.0: + liveins: $sgpr0 + ; WAVE32-LABEL: name: wave32_copy_sgpr_32_to_s1_vcc + ; WAVE32: liveins: $sgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vcc(s1) = COPY $sgpr0 + ; WAVE32-NEXT: [[CONST1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE32-NEXT: [[CONST2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; WAVE32-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY]](s1), [[CONST1]], [[CONST2]] + %0:vcc(s1) = COPY $sgpr0 + %1:_(s32) = G_ZEXT %0:vcc(s1) +... + +--- +name: copy_virt_reg_to_s1 +legalized: true + +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: copy_virt_reg_to_s1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s1) = COPY [[TRUNC]](s1) + ; + ; WAVE32-LABEL: name: copy_virt_reg_to_s1 + ; WAVE32: liveins: $vgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; WAVE32-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) + ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vgpr(s1) = COPY [[TRUNC]](s1) + %0:_(s32) = COPY $vgpr0 + %1:_(s1) = G_TRUNC %0 + %2:_(s1) = COPY %1 +... 
+ +--- +name: copy_virt_reg_to_s1_vgpr +legalized: true + +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: copy_virt_reg_to_s1_vgpr + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s1) = COPY [[TRUNC]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s1) = COPY [[COPY2]](s1) + ; + ; WAVE32-LABEL: name: copy_virt_reg_to_s1_vgpr + ; WAVE32: liveins: $vgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; WAVE32-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) + ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vgpr(s1) = COPY [[TRUNC]](s1) + ; WAVE32-NEXT: [[COPY3:%[0-9]+]]:vgpr(s1) = COPY [[COPY2]](s1) + %0:_(s32) = COPY $vgpr0 + %1:_(s1) = G_TRUNC %0 + %2:vgpr(s1) = COPY %1 + %3:_(s1) = COPY %2 +... + + +--- +name: copy_virt_reg_to_s1_vcc +legalized: true + +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: copy_virt_reg_to_s1_vcc + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[COPY2]](s1) + ; + ; WAVE32-LABEL: name: copy_virt_reg_to_s1_vcc + ; WAVE32: liveins: $vgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; WAVE32-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) + ; WAVE32-NEXT: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) + ; WAVE32-NEXT: [[COPY3:%[0-9]+]]:vcc(s1) = COPY [[COPY2]](s1) + %0:_(s32) = COPY $vgpr0 + %1:_(s1) = G_TRUNC %0 + %2:vcc(s1) = COPY %1 + %3:_(s1) = COPY %2 +... 
+ +--- +name: copy_s1_to_sgpr_64 +legalized: true + +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: copy_s1_to_sgpr_64 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[TRUNC]](s1) + ; + ; WAVE32-LABEL: name: copy_s1_to_sgpr_64 + ; WAVE32: liveins: $vgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; WAVE32-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) + ; WAVE32-NEXT: $sgpr4_sgpr5 = COPY [[TRUNC]](s1) + %0:_(s32) = COPY $vgpr0 + %1:_(s1) = G_TRUNC %0 + $sgpr4_sgpr5 = COPY %1 +... + +--- +name: copy_s1_to_sgpr_32 +legalized: true + +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: copy_s1_to_sgpr_32 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: $sgpr0 = COPY [[TRUNC]](s1) + ; + ; WAVE32-LABEL: name: copy_s1_to_sgpr_32 + ; WAVE32: liveins: $vgpr0 + ; WAVE32-NEXT: {{ $}} + ; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; WAVE32-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) + ; WAVE32-NEXT: $sgpr0 = COPY [[TRUNC]](s1) + %0:_(s32) = COPY $vgpr0 + %1:_(s1) = G_TRUNC %0 + $sgpr0 = COPY %1 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index 1315d576a83e..4b1484e9bd95 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -5,22 +5,11 @@ target triple = "amdgcn-amd-amdhsa" ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast: -; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} -; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}} -; CI-DAG: s_cmp_lg_u32 [[PTR]], -1 -; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0 -; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 - -; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_shared_base - +; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x0{{$}} +; GFX9-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x0{{$}} +; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], [[APERTURE]] ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 -; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} - -; GFX9: s_cmp_lg_u32 [[PTR]], -1 -; GFX9-DAG: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0 -; GFX9-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0 - -; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] +; HSA-DAG: ds_write_b32 [[PTR]], [[K]] ; HSA: .amdhsa_user_sgpr_private_segment_buffer 1 ; HSA: .amdhsa_user_sgpr_dispatch_ptr 0 @@ -39,22 +28,8 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr ; Test handling inside a non-kernel ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func: -; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x10{{$}} -; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] -; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0 -; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc -; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0 - -; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_shared_base - ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 - -; GFX9-DAG: v_mov_b32_e32 v[[VREG_HIBASE:[0-9]+]], s[[HIBASE]] -; GFX9-DAG: v_cmp_ne_u32_e32 vcc, -1, v0 
-; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0, vcc -; GFX9-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, v[[VREG_HIBASE]], vcc - -; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] +; HSA-DAG: ds_write_b32 v0, [[K]] define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 { %stof = addrspacecast ptr addrspace(3) %ptr to ptr store volatile i32 7, ptr %stof @@ -63,23 +38,16 @@ define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 { ; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast: -; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} -; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}} - -; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 -; CI-DAG: s_cmp_lg_u32 [[PTR]], -1 -; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0 -; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 - -; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} -; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_private_base - -; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 -; GFX9: s_cmp_lg_u32 [[PTR]], -1 -; GFX9: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0 -; GFX9: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0 - -; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] +; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x0{{$}} +; GFX9-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x0{{$}} +; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], [[APERTURE]] +; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3] +; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1] +; SI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9 +; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7 +; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0 +; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 +; HSA: buffer_store_dword [[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen ; HSA: .amdhsa_user_sgpr_private_segment_buffer 1 ; HSA: .amdhsa_user_sgpr_dispatch_ptr 0 @@ -97,10 +65,12 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p ; 
HSA-LABEL: {{^}}use_global_to_flat_addrspacecast: ; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]] -; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]] -; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] +; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]] +; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 -; HSA: flat_store_dword v[[[VPTRLO]]:[[VPTRHI]]], [[K]] +; CI: flat_store_dword v[[[VPTRLO]]:[[VPTRHI]]], [[K]] +; GFX9-DAG: v_mov_b32_e32 [[ADDR:v[0-9]+]], 0 +; GFX9: global_store_dword [[ADDR]], [[K]], s[[[PTRLO]]:[[PTRHI]]] ; HSA: .amdhsa_user_sgpr_queue_ptr 0 define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) #0 { @@ -112,9 +82,7 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt ; no-op ; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast: ; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]] -; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]] -; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] -; HSA: flat_load_dword v{{[0-9]+}}, v[[[VPTRLO]]:[[VPTRHI]]] +; HSA-DAG: s_load_dword s0, s[[[PTRLO]]:[[PTRHI]]], 0x0 define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) #0 { %stof = addrspacecast ptr addrspace(4) %ptr to ptr %ld = load volatile i32, ptr %stof @@ -215,14 +183,9 @@ define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #0 { } ; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast: -; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10 -; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]] - -; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base - ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} -; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]] +; HSA: ds_write_b32 v[[LO]], v[[K]] define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 { %cast = addrspacecast ptr addrspace(3) 
null to ptr store volatile i32 7, ptr %cast @@ -240,10 +203,9 @@ define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 { } ; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast: -; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} -; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]] +; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1 +; HSA: ds_write_b32 v[[LO]], v[[K]] define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 { %cast = addrspacecast ptr addrspace(3) inttoptr (i32 -1 to ptr addrspace(3)) to ptr store volatile i32 7, ptr %cast @@ -262,14 +224,13 @@ define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 { ; FIXME: Shouldn't need to enable queue ptr ; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast: -; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11 -; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]] - -; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_private_base - -; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} +; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3] +; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1] +; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7 +; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5 +; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} -; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]] +; HSA: buffer_store_dword v[[K]], off, s[[[BASELO]]:[[RSRCHI]]], 0 define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 { %cast = addrspacecast ptr addrspace(5) null to ptr store volatile i32 7, ptr %cast @@ -286,13 +247,16 @@ define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 { ret void } - ; HSA-LABEL: {{^}}cast_neg1_private_to_flat_addrspacecast: -; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} +; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3] +; HSA-DAG: s_mov_b64 
s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1] +; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7 +; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5 +; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0 +; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} -; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]] +; HSA: buffer_store_dword v[[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen ; CI: .amdhsa_user_sgpr_queue_ptr 1 ; GFX9: .amdhsa_user_sgpr_queue_ptr 0 @@ -342,16 +306,18 @@ end: ; Check for prologue initializing special SGPRs pointing to scratch. ; HSA-LABEL: {{^}}store_flat_scratch: -; CI-DAG: s_mov_b32 flat_scratch_lo, s9 ; CI-DAG: s_add_i32 [[ADD:s[0-9]+]], s8, s11 ; CI-DAG: s_lshr_b32 flat_scratch_hi, [[ADD]], 8 - -; GFX9: s_add_u32 flat_scratch_lo, s6, s9 -; GFX9: s_addc_u32 flat_scratch_hi, s7, 0 - -; HSA: {{flat|global}}_store_dword -; HSA: s_barrier -; HSA: {{flat|global}}_load_dword +; HSA: buffer_store_dword +; HSA: s_barrier +; HSA: buffer_load_dword [[K:v[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen glc +; HSA-DAG: s_load_dwordx2 +; CI-DAG: s_mov_b32 flat_scratch_lo, s9 +; CI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s4 +; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s5 +; GFX9-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], 0 +; CI: flat_store_dword v[[[LO]]:[[HI]]], [[K]] +; GFX9: global_store_dword [[PTR]], [[K]] define amdgpu_kernel void @store_flat_scratch(ptr addrspace(1) noalias %out, i32) #0 { %alloca = alloca i32, i32 9, align 4, addrspace(5) %x = call i32 @llvm.amdgcn.workitem.id.x() #2 diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index 43cdf85ed381..879bceaef97c 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -425,8 +425,7 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr 
; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] { -; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr -; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(3) [[PTR]], align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; %stof = addrspacecast ptr addrspace(3) %ptr to ptr @@ -443,8 +442,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR12]] { -; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr -; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(5) [[PTR]], align 4 ; ATTRIBUTOR_HSA-NEXT: ret void ; %stof = addrspacecast ptr addrspace(5) %ptr to ptr @@ -478,11 +476,16 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) #1 { ; No-op addrspacecast should not use queue ptr define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast -; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr -; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast +; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; AKF_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast +; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { +; 
ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %stof = addrspacecast ptr addrspace(1) %ptr to ptr store volatile i32 0, ptr %stof @@ -490,11 +493,16 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt } define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast -; HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] { -; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr -; HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr [[STOF]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast +; AKF_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr +; AKF_HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr [[STOF]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast +; ATTRIBUTOR_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] { +; ATTRIBUTOR_HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %stof = addrspacecast ptr addrspace(4) %ptr to ptr %ld = load volatile i32, ptr %stof diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll index 8ef2d89e76d4..032ec65fa851 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -38,15 +38,9 @@ define amdgpu_kernel void @kern_indirect_use_queue_ptr(i32) #1 { } ; GCN-LABEL: {{^}}use_queue_ptr_addrspacecast: -; CIVI: s_load_dword [[APERTURE_LOAD:s[0-9]+]], s[4:5], 0x0 -; CIVI: v_mov_b32_e32 v[[LO:[0-9]+]], 16 -; CIVI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]] +; GCN: v_mov_b32_e32 
v[[LO:[0-9]+]], 0 +; GCN-DAG: ds_write_b32 v[[LO]], v[[LO]] offset:16 -; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base -; GFX9-DAG: v_mov_b32_e32 v[[VGPR_HI:[0-9]+]], s[[HI]] -; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[VGPR_HI]]] - -; CIVI: {{flat|global}}_store_dword v[[[LO]]:[[HI]]] define hidden void @use_queue_ptr_addrspacecast() #1 { %asc = addrspacecast ptr addrspace(3) inttoptr (i32 16 to ptr addrspace(3)) to ptr store volatile i32 0, ptr %asc diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir index 2cd7b8a6424b..6ec296144bf1 100644 --- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir +++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir @@ -23,10 +23,35 @@ body: | bb.0: ; MUBUFW64-LABEL: name: v_add_co_u32_e32__inline_imm__fi_offset0 ; MUBUFW64: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__inline_imm__fi_offset0 + ; FLATSCRW64: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 12, %stack.0, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e32__inline_imm__fi_offset0_live_vcc +tracksRegLiveness: true +stack: + - { id: 0, size: 4, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; MUBUFW64-LABEL: name: v_add_co_u32_e32__inline_imm__fi_offset0_live_vcc + ; MUBUFW64: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def $vcc, implicit $exec ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; - ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__inline_imm__fi_offset0 + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__inline_imm__fi_offset0_live_vcc ; FLATSCRW64: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def $vcc, implicit $exec ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc @@ -49,35 +74,87 @@ body: | bb.0: ; GFX7-LABEL: name: v_add_co_u32_e32__inline_imm___fi_offset_inline_imm ; GFX7: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX7-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 16, killed $vgpr1, 0, implicit $exec + ; GFX7-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX7-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX8-LABEL: name: v_add_co_u32_e32__inline_imm___fi_offset_inline_imm + ; GFX8: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX8-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 16, killed $vgpr1, 0, implicit $exec + ; GFX8-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX8-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX900-LABEL: name: v_add_co_u32_e32__inline_imm___fi_offset_inline_imm + ; GFX900: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 16, killed $vgpr1, implicit $exec + ; GFX900-NEXT: renamable 
$vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX90A-LABEL: name: v_add_co_u32_e32__inline_imm___fi_offset_inline_imm + ; GFX90A: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 16, killed $vgpr1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX10-LABEL: name: v_add_co_u32_e32__inline_imm___fi_offset_inline_imm + ; GFX10: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_ADD_U32_e32 16, killed $vgpr1, implicit $exec + ; GFX10-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__inline_imm___fi_offset_inline_imm + ; FLATSCRW64: $sgpr4 = S_ADD_I32 $sgpr32, 16, implicit-def $scc + ; FLATSCRW64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr4, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 12, %stack.1, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e32__inline_imm___fi_offset_inline_imm_live_vcc +tracksRegLiveness: true +stack: + - { id: 0, size: 16, alignment: 16 } + - { id: 1, size: 4, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX7-LABEL: name: v_add_co_u32_e32__inline_imm___fi_offset_inline_imm_live_vcc + ; GFX7: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX7-NEXT: $vgpr1, dead $sgpr4_sgpr5 = V_ADD_CO_U32_e64 16, killed $vgpr1, 0, implicit $exec ; GFX7-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def $vcc, implicit $exec ; GFX7-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; - ; GFX8-LABEL: name: v_add_co_u32_e32__inline_imm___fi_offset_inline_imm + ; GFX8-LABEL: name: v_add_co_u32_e32__inline_imm___fi_offset_inline_imm_live_vcc ; GFX8: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX8-NEXT: $vgpr1, dead $sgpr4_sgpr5 = V_ADD_CO_U32_e64 16, killed $vgpr1, 0, implicit $exec ; GFX8-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def $vcc, implicit $exec ; GFX8-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; - ; GFX900-LABEL: name: v_add_co_u32_e32__inline_imm___fi_offset_inline_imm + ; GFX900-LABEL: name: v_add_co_u32_e32__inline_imm___fi_offset_inline_imm_live_vcc ; GFX900: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 16, killed $vgpr1, implicit $exec ; GFX900-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def $vcc, implicit $exec ; GFX900-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; - ; GFX90A-LABEL: name: v_add_co_u32_e32__inline_imm___fi_offset_inline_imm + ; GFX90A-LABEL: name: v_add_co_u32_e32__inline_imm___fi_offset_inline_imm_live_vcc ; GFX90A: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 16, killed $vgpr1, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = 
V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; - ; GFX10-LABEL: name: v_add_co_u32_e32__inline_imm___fi_offset_inline_imm + ; GFX10-LABEL: name: v_add_co_u32_e32__inline_imm___fi_offset_inline_imm_live_vcc ; GFX10: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX10-NEXT: $vgpr1 = V_ADD_U32_e32 16, killed $vgpr1, implicit $exec ; GFX10-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def $vcc, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; - ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__inline_imm___fi_offset_inline_imm + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__inline_imm___fi_offset_inline_imm_live_vcc ; FLATSCRW64: $sgpr4 = S_ADD_I32 $sgpr32, 16, implicit-def $scc ; FLATSCRW64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr4, implicit $exec ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def $vcc, implicit $exec @@ -100,10 +177,35 @@ body: | bb.0: ; MUBUFW64-LABEL: name: v_add_co_u32_e32__literal__fi_offset0 ; MUBUFW64: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 68, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__literal__fi_offset0 + ; FLATSCRW64: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 68, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 68, %stack.0, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e32__literal__fi_offset0_live_vcc +tracksRegLiveness: true +stack: + - { id: 0, size: 4, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; MUBUFW64-LABEL: name: v_add_co_u32_e32__literal__fi_offset0_live_vcc + ; MUBUFW64: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 68, killed $vgpr1, implicit-def $vcc, implicit $exec ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; - ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__literal__fi_offset0 + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__literal__fi_offset0_live_vcc ; FLATSCRW64: $vgpr1 = V_MOV_B32_e32 $sgpr32, implicit $exec ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 68, killed $vgpr1, implicit-def $vcc, implicit $exec ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc @@ -126,35 +228,87 @@ body: | bb.0: ; GFX7-LABEL: name: v_add_co_u32_e32__literal__fi_offset0__offset_inlineimm ; GFX7: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX7-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 32, killed $vgpr1, 0, implicit $exec + ; GFX7-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 68, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX7-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX8-LABEL: name: v_add_co_u32_e32__literal__fi_offset0__offset_inlineimm + ; GFX8: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX8-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 32, killed $vgpr1, 0, implicit $exec + ; GFX8-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 68, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX8-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX900-LABEL: name: v_add_co_u32_e32__literal__fi_offset0__offset_inlineimm + ; GFX900: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 32, killed $vgpr1, implicit $exec + ; GFX900-NEXT: 
renamable $vgpr0 = V_ADD_CO_U32_e32 68, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX90A-LABEL: name: v_add_co_u32_e32__literal__fi_offset0__offset_inlineimm + ; GFX90A: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 32, killed $vgpr1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 68, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX10-LABEL: name: v_add_co_u32_e32__literal__fi_offset0__offset_inlineimm + ; GFX10: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_ADD_U32_e32 32, killed $vgpr1, implicit $exec + ; GFX10-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 68, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__literal__fi_offset0__offset_inlineimm + ; FLATSCRW64: $sgpr4 = S_ADD_I32 $sgpr32, 32, implicit-def $scc + ; FLATSCRW64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr4, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 68, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 68, %stack.1, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e32__literal__fi_offset0__offset_inlineimm_live_vcc +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } + - { id: 1, size: 4, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX7-LABEL: name: v_add_co_u32_e32__literal__fi_offset0__offset_inlineimm_live_vcc + ; GFX7: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX7-NEXT: $vgpr1, dead $sgpr4_sgpr5 = V_ADD_CO_U32_e64 32, killed $vgpr1, 0, implicit $exec ; GFX7-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 68, killed $vgpr1, implicit-def $vcc, implicit $exec ; GFX7-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; - ; GFX8-LABEL: name: v_add_co_u32_e32__literal__fi_offset0__offset_inlineimm + ; GFX8-LABEL: name: v_add_co_u32_e32__literal__fi_offset0__offset_inlineimm_live_vcc ; GFX8: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX8-NEXT: $vgpr1, dead $sgpr4_sgpr5 = V_ADD_CO_U32_e64 32, killed $vgpr1, 0, implicit $exec ; GFX8-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 68, killed $vgpr1, implicit-def $vcc, implicit $exec ; GFX8-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; - ; GFX900-LABEL: name: v_add_co_u32_e32__literal__fi_offset0__offset_inlineimm + ; GFX900-LABEL: name: v_add_co_u32_e32__literal__fi_offset0__offset_inlineimm_live_vcc ; GFX900: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 32, killed $vgpr1, implicit $exec ; GFX900-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 68, killed $vgpr1, implicit-def $vcc, implicit $exec ; GFX900-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; - ; GFX90A-LABEL: name: v_add_co_u32_e32__literal__fi_offset0__offset_inlineimm + ; GFX90A-LABEL: name: v_add_co_u32_e32__literal__fi_offset0__offset_inlineimm_live_vcc ; GFX90A: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 32, killed $vgpr1, implicit $exec ; 
GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 68, killed $vgpr1, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; - ; GFX10-LABEL: name: v_add_co_u32_e32__literal__fi_offset0__offset_inlineimm + ; GFX10-LABEL: name: v_add_co_u32_e32__literal__fi_offset0__offset_inlineimm_live_vcc ; GFX10: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX10-NEXT: $vgpr1 = V_ADD_U32_e32 32, killed $vgpr1, implicit $exec ; GFX10-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 68, killed $vgpr1, implicit-def $vcc, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; - ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__literal__fi_offset0__offset_inlineimm + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__literal__fi_offset0__offset_inlineimm_live_vcc ; FLATSCRW64: $sgpr4 = S_ADD_I32 $sgpr32, 32, implicit-def $scc ; FLATSCRW64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr4, implicit $exec ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 68, killed $vgpr1, implicit-def $vcc, implicit $exec @@ -180,17 +334,17 @@ body: | ; MUBUFW64: liveins: $vgpr1 ; MUBUFW64-NEXT: {{ $}} ; MUBUFW64-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec - ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, killed $vgpr2, implicit-def $vcc, implicit $exec - ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, killed $vgpr2, implicit-def dead $vcc, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0 ; ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__vgpr__fi_offset0 ; FLATSCRW64: liveins: $vgpr1 ; FLATSCRW64-NEXT: {{ $}} ; FLATSCRW64-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr32, implicit $exec - ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, killed $vgpr2, implicit-def $vcc, implicit $exec - ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc - renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, %stack.0, implicit-def $vcc, implicit $exec - SI_RETURN 
implicit $vgpr0, implicit $vcc + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, killed $vgpr2, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, %stack.0, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0 ... @@ -210,16 +364,16 @@ body: | ; MUBUFW64: liveins: $vgpr1 ; MUBUFW64-NEXT: {{ $}} ; MUBUFW64-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec - ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 killed $vgpr2, $vgpr1, implicit-def $vcc, implicit $exec - ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 killed $vgpr2, $vgpr1, implicit-def dead $vcc, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0 ; ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__fi_offset0__vgpr ; FLATSCRW64: liveins: $vgpr1 ; FLATSCRW64-NEXT: {{ $}} - ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr32, $vgpr1, implicit-def $vcc, implicit $exec - ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc - renamable $vgpr0 = V_ADD_CO_U32_e32 %stack.0, $vgpr1, implicit-def $vcc, implicit $exec - SI_RETURN implicit $vgpr0, implicit $vcc + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr32, $vgpr1, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 %stack.0, $vgpr1, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0 ... 
@@ -240,53 +394,53 @@ body: | ; GFX7: liveins: $vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec - ; GFX7-NEXT: $sgpr4 = S_MOV_B32 128 - ; GFX7-NEXT: $vgpr2, dead $sgpr4_sgpr5 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr2, 0, implicit $exec - ; GFX7-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, killed $vgpr2, implicit-def $vcc, implicit $exec - ; GFX7-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; GFX7-NEXT: $vcc_lo = S_MOV_B32 128 + ; GFX7-NEXT: $vgpr2, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr2, 0, implicit $exec + ; GFX7-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, killed $vgpr2, implicit-def dead $vcc, implicit $exec + ; GFX7-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX8-LABEL: name: v_add_co_u32_e32__vgpr__fi_literal_offset ; GFX8: liveins: $vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec - ; GFX8-NEXT: $sgpr4 = S_MOV_B32 128 - ; GFX8-NEXT: $vgpr2, dead $sgpr4_sgpr5 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr2, 0, implicit $exec - ; GFX8-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, killed $vgpr2, implicit-def $vcc, implicit $exec - ; GFX8-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; GFX8-NEXT: $vcc_lo = S_MOV_B32 128 + ; GFX8-NEXT: $vgpr2, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr2, 0, implicit $exec + ; GFX8-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, killed $vgpr2, implicit-def dead $vcc, implicit $exec + ; GFX8-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX900-LABEL: name: v_add_co_u32_e32__vgpr__fi_literal_offset ; GFX900: liveins: $vgpr1 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX900-NEXT: $vgpr2 = V_ADD_U32_e32 128, killed $vgpr2, implicit $exec - ; GFX900-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, killed $vgpr2, implicit-def $vcc, implicit $exec - ; GFX900-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; GFX900-NEXT: renamable $vgpr0 = 
V_ADD_CO_U32_e32 $vgpr1, killed $vgpr2, implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX90A-LABEL: name: v_add_co_u32_e32__vgpr__fi_literal_offset ; GFX90A: liveins: $vgpr1 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX90A-NEXT: $vgpr2 = V_ADD_U32_e32 128, killed $vgpr2, implicit $exec - ; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, killed $vgpr2, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, killed $vgpr2, implicit-def dead $vcc, implicit $exec + ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX10-LABEL: name: v_add_co_u32_e32__vgpr__fi_literal_offset ; GFX10: liveins: $vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX10-NEXT: $vgpr2 = V_ADD_U32_e32 128, killed $vgpr2, implicit $exec - ; GFX10-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, killed $vgpr2, implicit-def $vcc, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; GFX10-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, killed $vgpr2, implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr0 ; ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__vgpr__fi_literal_offset ; FLATSCRW64: liveins: $vgpr1 ; FLATSCRW64-NEXT: {{ $}} ; FLATSCRW64-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc ; FLATSCRW64-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr4, implicit $exec - ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, killed $vgpr2, implicit-def $vcc, implicit $exec - ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc - renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, %stack.1, implicit-def $vcc, implicit $exec - SI_RETURN implicit $vgpr0, implicit $vcc + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, killed $vgpr2, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: 
SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, %stack.1, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0 ... @@ -307,52 +461,52 @@ body: | ; GFX7: liveins: $vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec - ; GFX7-NEXT: $sgpr4 = S_MOV_B32 128 - ; GFX7-NEXT: $vgpr2, dead $sgpr4_sgpr5 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr2, 0, implicit $exec - ; GFX7-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 killed $vgpr2, $vgpr1, implicit-def $vcc, implicit $exec - ; GFX7-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; GFX7-NEXT: $vcc_lo = S_MOV_B32 128 + ; GFX7-NEXT: $vgpr2, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr2, 0, implicit $exec + ; GFX7-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 killed $vgpr2, $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX7-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX8-LABEL: name: v_add_co_u32_e32__fi_literal_offset__vgpr ; GFX8: liveins: $vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec - ; GFX8-NEXT: $sgpr4 = S_MOV_B32 128 - ; GFX8-NEXT: $vgpr2, dead $sgpr4_sgpr5 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr2, 0, implicit $exec - ; GFX8-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 killed $vgpr2, $vgpr1, implicit-def $vcc, implicit $exec - ; GFX8-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; GFX8-NEXT: $vcc_lo = S_MOV_B32 128 + ; GFX8-NEXT: $vgpr2, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr2, 0, implicit $exec + ; GFX8-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 killed $vgpr2, $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX8-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX900-LABEL: name: v_add_co_u32_e32__fi_literal_offset__vgpr ; GFX900: liveins: $vgpr1 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX900-NEXT: $vgpr2 = V_ADD_U32_e32 128, killed $vgpr2, implicit $exec - ; GFX900-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 killed 
$vgpr2, $vgpr1, implicit-def $vcc, implicit $exec - ; GFX900-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; GFX900-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 killed $vgpr2, $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX90A-LABEL: name: v_add_co_u32_e32__fi_literal_offset__vgpr ; GFX90A: liveins: $vgpr1 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX90A-NEXT: $vgpr2 = V_ADD_U32_e32 128, killed $vgpr2, implicit $exec - ; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 killed $vgpr2, $vgpr1, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 killed $vgpr2, $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX10-LABEL: name: v_add_co_u32_e32__fi_literal_offset__vgpr ; GFX10: liveins: $vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX10-NEXT: $vgpr2 = V_ADD_U32_e32 128, killed $vgpr2, implicit $exec - ; GFX10-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 killed $vgpr2, $vgpr1, implicit-def $vcc, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; GFX10-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 killed $vgpr2, $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr0 ; ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__fi_literal_offset__vgpr ; FLATSCRW64: liveins: $vgpr1 ; FLATSCRW64-NEXT: {{ $}} ; FLATSCRW64-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc - ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr4, $vgpr1, implicit-def $vcc, implicit $exec - ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc - renamable $vgpr0 = V_ADD_CO_U32_e32 %stack.1, $vgpr1, implicit-def $vcc, implicit $exec - SI_RETURN implicit $vgpr0, implicit $vcc + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr4, 
$vgpr1, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 %stack.1, $vgpr1, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0 ... @@ -373,53 +527,53 @@ body: | ; GFX7: liveins: $sgpr8 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec - ; GFX7-NEXT: $sgpr4 = S_MOV_B32 128 - ; GFX7-NEXT: $vgpr1, dead $sgpr4_sgpr5 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr1, 0, implicit $exec - ; GFX7-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, killed $vgpr1, implicit-def $vcc, implicit $exec - ; GFX7-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; GFX7-NEXT: $vcc_lo = S_MOV_B32 128 + ; GFX7-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec + ; GFX7-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX7-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX8-LABEL: name: v_add_co_u32_e32__sgpr__fi_literal_offset ; GFX8: liveins: $sgpr8 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec - ; GFX8-NEXT: $sgpr4 = S_MOV_B32 128 - ; GFX8-NEXT: $vgpr1, dead $sgpr4_sgpr5 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr1, 0, implicit $exec - ; GFX8-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, killed $vgpr1, implicit-def $vcc, implicit $exec - ; GFX8-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; GFX8-NEXT: $vcc_lo = S_MOV_B32 128 + ; GFX8-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec + ; GFX8-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX8-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX900-LABEL: name: v_add_co_u32_e32__sgpr__fi_literal_offset ; GFX900: liveins: $sgpr8 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit 
$exec - ; GFX900-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, killed $vgpr1, implicit-def $vcc, implicit $exec - ; GFX900-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; GFX900-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX90A-LABEL: name: v_add_co_u32_e32__sgpr__fi_literal_offset ; GFX90A: liveins: $sgpr8 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec - ; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, killed $vgpr1, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX10-LABEL: name: v_add_co_u32_e32__sgpr__fi_literal_offset ; GFX10: liveins: $sgpr8 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec ; GFX10-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec - ; GFX10-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, killed $vgpr1, implicit-def $vcc, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; GFX10-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr0 ; ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__sgpr__fi_literal_offset ; FLATSCRW64: liveins: $sgpr8 ; FLATSCRW64-NEXT: {{ $}} ; FLATSCRW64-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc ; FLATSCRW64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr4, implicit $exec - ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, killed $vgpr1, implicit-def $vcc, implicit $exec - ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc - renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, %stack.1, implicit-def 
$vcc, implicit $exec - SI_RETURN implicit $vgpr0, implicit $vcc + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, %stack.1, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0 ... @@ -448,6 +602,54 @@ body: | ... --- +name: v_add_co_u32_e64__inline_imm__fi_offset0__clamp +tracksRegLiveness: true +stack: + - { id: 0, size: 4, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; MUBUFW64-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset0__clamp + ; MUBUFW64: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; MUBUFW64-NEXT: renamable $vgpr0, dead $vcc = V_ADD_CO_U32_e64 12, killed $vgpr1, 1, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset0__clamp + ; FLATSCRW64: renamable $vgpr0, dead $vcc = V_ADD_CO_U32_e64 12, $sgpr32, 1, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0, dead $vcc = V_ADD_CO_U32_e64 12, %stack.0, 1, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e64__inline_imm__fi_offset0__live_vcc_clamp +tracksRegLiveness: true +stack: + - { id: 0, size: 4, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; MUBUFW64-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset0__live_vcc_clamp + ; MUBUFW64: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; MUBUFW64-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 12, killed $vgpr1, 1, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset0__live_vcc_clamp + ; FLATSCRW64: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 12, $sgpr32, 1, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 12, %stack.0, 1, implicit $exec + SI_RETURN implicit $vgpr0, implicit $vcc + +... + +--- name: v_add_co_u32_e64__fi_literal_offset__sgpr tracksRegLiveness: true stack: @@ -527,3 +729,1186 @@ body: | SI_RETURN implicit $vgpr0, implicit $vcc ... 
+ +--- +name: v_add_co_u32_e64__fi_literal_offset__sgpr_clamp +tracksRegLiveness: true +stack: + - { id: 0, size: 128, alignment: 16 } + - { id: 1, size: 4, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr8 + ; GFX7-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr_clamp + ; GFX7: liveins: $sgpr8 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX7-NEXT: $sgpr4 = S_MOV_B32 128 + ; GFX7-NEXT: $vgpr1, dead $sgpr4_sgpr5 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr1, 0, implicit $exec + ; GFX7-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $sgpr8, 1, implicit $exec + ; GFX7-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; + ; GFX8-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr_clamp + ; GFX8: liveins: $sgpr8 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX8-NEXT: $sgpr4 = S_MOV_B32 128 + ; GFX8-NEXT: $vgpr1, dead $sgpr4_sgpr5 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr1, 0, implicit $exec + ; GFX8-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $sgpr8, 1, implicit $exec + ; GFX8-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; + ; GFX900-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr_clamp + ; GFX900: liveins: $sgpr8 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX900-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $sgpr8, 1, implicit $exec + ; GFX900-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; + ; GFX90A-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr_clamp + ; GFX90A: liveins: $sgpr8 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX90A-NEXT: $vgpr1 = 
V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $sgpr8, 1, implicit $exec + ; GFX90A-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; + ; GFX10-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr_clamp + ; GFX10: liveins: $sgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX10-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $sgpr8, 1, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; + ; GFX940-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr_clamp + ; GFX940: liveins: $sgpr8 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc + ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr4, implicit $exec + ; GFX940-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $sgpr8, 1, implicit $exec + ; GFX940-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; + ; GFX11-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr_clamp + ; GFX11: liveins: $sgpr8 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc + ; GFX11-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr4, $sgpr8, 1, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; + ; GFX12-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr_clamp + ; GFX12: liveins: $sgpr8 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc + ; GFX12-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr4, $sgpr8, 1, implicit $exec + ; GFX12-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 %stack.1, $sgpr8, 1, implicit $exec + SI_RETURN implicit $vgpr0, implicit $vcc + +... 
+ +--- +name: v_add_co_u32_e64__fi_literal_offset__vgpr +tracksRegLiveness: true +stack: + - { id: 0, size: 128, alignment: 16 } + - { id: 1, size: 4, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr8 + + ; GFX7-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr + ; GFX7: liveins: $vgpr8 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX7-NEXT: $vcc_lo = S_MOV_B32 128 + ; GFX7-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec + ; GFX7-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $vgpr8, 0, implicit $exec + ; GFX7-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX8-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr + ; GFX8: liveins: $vgpr8 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX8-NEXT: $vcc_lo = S_MOV_B32 128 + ; GFX8-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec + ; GFX8-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $vgpr8, 0, implicit $exec + ; GFX8-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX900-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr + ; GFX900: liveins: $vgpr8 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX900-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $vgpr8, 0, implicit $exec + ; GFX900-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX90A-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr + ; GFX90A: liveins: $vgpr8 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX90A-NEXT: 
renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $vgpr8, 0, implicit $exec + ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX10-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr + ; GFX10: liveins: $vgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX10-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $vgpr8, 0, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr + ; FLATSCRW64: liveins: $vgpr8 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc + ; FLATSCRW64-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr4, $vgpr8, 0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0, renamable dead $vcc = V_ADD_CO_U32_e64 %stack.1, $vgpr8, 0, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e64__fi_literal_offset__vgpr__clamp +tracksRegLiveness: true +stack: + - { id: 0, size: 128, alignment: 16 } + - { id: 1, size: 4, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr8 + + ; GFX7-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr__clamp + ; GFX7: liveins: $vgpr8 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX7-NEXT: $vcc_lo = S_MOV_B32 128 + ; GFX7-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec + ; GFX7-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $vgpr8, 1, implicit $exec + ; GFX7-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX8-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr__clamp + ; GFX8: liveins: $vgpr8 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX8-NEXT: $vcc_lo = S_MOV_B32 128 + ; GFX8-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec + ; GFX8-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $vgpr8, 1, implicit $exec + ; GFX8-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX900-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr__clamp + ; GFX900: liveins: $vgpr8 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX900-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $vgpr8, 1, implicit $exec + ; GFX900-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX90A-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr__clamp + ; GFX90A: liveins: $vgpr8 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, 
implicit $exec + ; GFX90A-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $vgpr8, 1, implicit $exec + ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX10-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr__clamp + ; GFX10: liveins: $vgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX10-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $vgpr8, 1, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr__clamp + ; FLATSCRW64: liveins: $vgpr8 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc + ; FLATSCRW64-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr4, $vgpr8, 1, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0, renamable dead $vcc = V_ADD_CO_U32_e64 %stack.1, $vgpr8, 1, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e64__fi_literal_offset__vgpr__live_vcc +tracksRegLiveness: true +stack: + - { id: 0, size: 128, alignment: 16 } + - { id: 1, size: 4, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr8 + + ; GFX7-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr__live_vcc + ; GFX7: liveins: $vgpr8 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX7-NEXT: $sgpr4 = S_MOV_B32 128 + ; GFX7-NEXT: $vgpr1, dead $sgpr4_sgpr5 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr1, 0, implicit $exec + ; GFX7-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $vgpr8, 0, implicit $exec + ; GFX7-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; + ; GFX8-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr__live_vcc + ; GFX8: liveins: $vgpr8 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX8-NEXT: $sgpr4 = S_MOV_B32 128 + ; GFX8-NEXT: $vgpr1, dead $sgpr4_sgpr5 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr1, 0, implicit $exec + ; GFX8-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $vgpr8, 0, implicit $exec + ; GFX8-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; + ; GFX900-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr__live_vcc + ; GFX900: liveins: $vgpr8 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX900-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $vgpr8, 0, implicit $exec + ; GFX900-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; + ; GFX90A-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr__live_vcc + ; GFX90A: liveins: $vgpr8 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; 
GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $vgpr8, 0, implicit $exec + ; GFX90A-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; + ; GFX10-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr__live_vcc + ; GFX10: liveins: $vgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX10-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $vgpr8, 0, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e64__fi_literal_offset__vgpr__live_vcc + ; FLATSCRW64: liveins: $vgpr8 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc + ; FLATSCRW64-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr4, $vgpr8, 0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 %stack.1, $vgpr8, 0, implicit $exec + SI_RETURN implicit $vgpr0, implicit $vcc + +... 
+ +--- +name: v_add_co_u32_e32__inline_imm__fi_offset0__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + ; MUBUFW64-LABEL: name: v_add_co_u32_e32__inline_imm__fi_offset0__kernel + ; MUBUFW64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__inline_imm__fi_offset0__kernel + ; FLATSCRW64: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 12, %stack.0, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e32__inline_imm__fi_offset0__kernel__live_vcc +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + ; MUBUFW64-LABEL: name: v_add_co_u32_e32__inline_imm__fi_offset0__kernel__live_vcc + ; MUBUFW64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def $vcc, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__inline_imm__fi_offset0__kernel__live_vcc + ; FLATSCRW64: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + renamable $vgpr0 = V_ADD_CO_U32_e32 12, %stack.0, implicit-def $vcc, implicit $exec + SI_RETURN implicit $vgpr0, implicit $vcc + +... 
+ +--- +name: v_add_co_u32_e32__inline_imm__fi_offset_literal__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 72, alignment: 16 } + - { id: 1, size: 40, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + ; MUBUFW64-LABEL: name: v_add_co_u32_e32__inline_imm__fi_offset_literal__kernel + ; MUBUFW64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $vgpr1 = V_MOV_B32_e32 72, implicit $exec + ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__inline_imm__fi_offset_literal__kernel + ; FLATSCRW64: $vgpr1 = V_MOV_B32_e32 72, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 12, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 12, %stack.1, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e64__inline_imm__fi_offset_literal__kernel__clamp +tracksRegLiveness: true +stack: + - { id: 0, size: 72, alignment: 16 } + - { id: 1, size: 40, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + ; GFX7-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset_literal__kernel__clamp + ; GFX7: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX7-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX7-NEXT: $vgpr1 = V_MOV_B32_e32 72, implicit $exec + ; GFX7-NEXT: renamable $vgpr0, dead $vcc = V_ADD_CO_U32_e64 12, killed $vgpr1, 1, implicit $exec + ; GFX7-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX8-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset_literal__kernel__clamp + ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX8-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX8-NEXT: $vgpr1 = V_MOV_B32_e32 72, implicit $exec + ; GFX8-NEXT: renamable $vgpr0, dead $vcc = V_ADD_CO_U32_e64 12, killed $vgpr1, 1, implicit $exec + ; GFX8-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX900-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset_literal__kernel__clamp + ; GFX900: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX900-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX900-NEXT: $vgpr1 = V_MOV_B32_e32 72, implicit $exec + ; GFX900-NEXT: renamable 
$vgpr0, dead $vcc = V_ADD_CO_U32_e64 12, killed $vgpr1, 1, implicit $exec + ; GFX900-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX90A-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset_literal__kernel__clamp + ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 72, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0, dead $vcc = V_ADD_CO_U32_e64 12, killed $vgpr1, 1, implicit $exec + ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX10-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset_literal__kernel__clamp + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: renamable $vgpr0, dead $vcc = V_ADD_CO_U32_e64 12, 72, 1, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX940-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset_literal__kernel__clamp + ; GFX940: $sgpr4 = S_MOV_B32 72 + ; GFX940-NEXT: renamable $vgpr0, dead $vcc = V_ADD_CO_U32_e64 12, killed $sgpr4, 1, implicit $exec + ; GFX940-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX11-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset_literal__kernel__clamp + ; GFX11: renamable $vgpr0, dead $vcc = V_ADD_CO_U32_e64 12, 72, 1, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX12-LABEL: name: v_add_co_u32_e64__inline_imm__fi_offset_literal__kernel__clamp + ; GFX12: renamable $vgpr0, dead $vcc = V_ADD_CO_U32_e64 12, 72, 1, implicit $exec + ; GFX12-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0, dead $vcc = V_ADD_CO_U32_e64 12, %stack.1, 
1, implicit $exec + SI_RETURN implicit $vgpr0 + +... + +--- +name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required +tracksRegLiveness: true +stack: + - { id: 0, size: 128, alignment: 16 } + - { id: 1, size: 4, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 + + ; GFX7-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required + ; GFX7: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX7-NEXT: $vcc_lo = S_MOV_B32 128 + ; GFX7-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec + ; GFX7-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $sgpr8, 0, implicit $exec + ; GFX7-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; GFX7-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; GFX7-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX8-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required + ; GFX8: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX8-NEXT: $vcc_lo = S_MOV_B32 128 + ; GFX8-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, 
implicit $exec + ; GFX8-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $sgpr8, 0, implicit $exec + ; GFX8-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; GFX8-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; GFX8-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX900-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required + ; GFX900: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX900-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $sgpr8, 0, implicit $exec + ; GFX900-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; GFX900-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; GFX900-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX90A-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required + ; GFX90A: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; 
GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $sgpr8, 0, implicit $exec + ; GFX90A-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; GFX90A-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX10-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required + ; GFX10: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GFX10-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX10-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $sgpr8, 0, implicit $exec + ; GFX10-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; GFX10-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; GFX10-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX940-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required + ; GFX940: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: (store (s32) into 
%stack.2, addrspace 5) + ; GFX940-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc + ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr4, implicit $exec + ; GFX940-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr1, $sgpr8, 0, implicit $exec + ; GFX940-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + ; GFX940-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; GFX940-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX11-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required + ; GFX11: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc + ; GFX11-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr4, $sgpr8, 0, implicit $exec + ; GFX11-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; GFX11-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX12-LABEL: name: v_add_co_u32_e64__fi_literal_offset__sgpr__scavenge_spill_required + ; GFX12: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc + ; GFX12-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr4, $sgpr8, 0, implicit $exec + ; GFX12-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; GFX12-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0, renamable $vcc = 
V_ADD_CO_U32_e64 %stack.1, $sgpr8, 0, implicit $exec + + S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + SI_RETURN implicit $vgpr0 + +... + +--- +name: v_add_co_u32_e32__fi_literal_offset__sgpr__scavenge_spill_required +tracksRegLiveness: true +stack: + - { id: 0, size: 128, alignment: 16 } + - { id: 1, size: 4, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 + + ; GFX7-LABEL: name: v_add_co_u32_e32__fi_literal_offset__sgpr__scavenge_spill_required + ; GFX7: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX7-NEXT: $vcc_lo = S_MOV_B32 128 + ; GFX7-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec + ; GFX7-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX7-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; GFX7-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; GFX7-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX8-LABEL: name: v_add_co_u32_e32__fi_literal_offset__sgpr__scavenge_spill_required + ; GFX8: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, 
$vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: 
BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX8-NEXT: $vcc_lo = S_MOV_B32 128 + ; GFX8-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec + ; GFX8-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX8-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; GFX8-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; GFX8-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX900-LABEL: name: v_add_co_u32_e32__fi_literal_offset__sgpr__scavenge_spill_required + ; GFX900: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX900-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; GFX900-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; GFX900-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX90A-LABEL: name: 
v_add_co_u32_e32__fi_literal_offset__sgpr__scavenge_spill_required + ; GFX90A: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX90A-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; GFX90A-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX10-LABEL: name: v_add_co_u32_e32__fi_literal_offset__sgpr__scavenge_spill_required + ; GFX10: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GFX10-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX10-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; GFX10-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, 
implicit $vgpr254, implicit $vgpr255 + ; GFX10-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__fi_literal_offset__sgpr__scavenge_spill_required + ; FLATSCRW64: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + ; FLATSCRW64-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc + ; FLATSCRW64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr4, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + ; FLATSCRW64-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr8, %stack.1, implicit-def dead $vcc, implicit $exec + + S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e32__fi_literal_offset__vgpr__scavenge_spill_required +tracksRegLiveness: true +stack: + - { id: 0, size: 128, alignment: 16 } + - { id: 1, size: 4, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254, $vgpr255, $sgpr8 + + ; GFX7-LABEL: name: v_add_co_u32_e32__fi_literal_offset__vgpr__scavenge_spill_required + ; GFX7: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX7-NEXT: {{ $}} + ; GFX7-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX7-NEXT: $vcc_lo = S_MOV_B32 128 + ; GFX7-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec + ; GFX7-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr8, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX7-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; GFX7-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; GFX7-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX8-LABEL: name: v_add_co_u32_e32__fi_literal_offset__vgpr__scavenge_spill_required + ; GFX8: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX8-NEXT: $vcc_lo = S_MOV_B32 128 + ; GFX8-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, 
implicit $exec + ; GFX8-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr8, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX8-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; GFX8-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; GFX8-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX900-LABEL: name: v_add_co_u32_e32__fi_literal_offset__vgpr__scavenge_spill_required + ; GFX900: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX900-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr8, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; GFX900-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; GFX900-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX90A-LABEL: name: v_add_co_u32_e32__fi_literal_offset__vgpr__scavenge_spill_required + ; GFX90A: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) 
into %stack.2, addrspace 5) + ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr8, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX90A-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; GFX90A-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 + ; + ; GFX10-LABEL: name: v_add_co_u32_e32__fi_literal_offset__vgpr__scavenge_spill_required + ; GFX10: liveins: $sgpr8, $vgpr254, $vgpr255, $vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; GFX10-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; GFX10-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; GFX10-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr8, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; GFX10-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; GFX10-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__fi_literal_offset__vgpr__scavenge_spill_required + ; FLATSCRW64: liveins: $sgpr8, $vgpr254, $vgpr255, 
$vgpr252_vgpr253, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; FLATSCRW64-NEXT: {{ $}} + ; 
FLATSCRW64-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + ; FLATSCRW64-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc + ; FLATSCRW64-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr4, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr8, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 132, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + ; FLATSCRW64-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr8, %stack.1, implicit-def dead $vcc, implicit $exec + + S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, implicit $vgpr248_vgpr249_vgpr250_vgpr251, implicit $vgpr252_vgpr253, implicit $vgpr254, implicit $vgpr255 + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e32__kernel_fi_offset0__other_vgpr_live_after +tracksRegLiveness: true +stack: + - { id: 0, size: 16, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr1 + ; MUBUFW64-LABEL: name: v_add_co_u32_e32__kernel_fi_offset0__other_vgpr_live_after + ; MUBUFW64: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, killed $vgpr2, implicit-def dead $vcc, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__kernel_fi_offset0__other_vgpr_live_after + ; FLATSCRW64: liveins: $vgpr1 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr1, killed $vgpr2, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + renamable $vgpr0 = V_ADD_CO_U32_e32 renamable $vgpr1, %stack.0, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0, implicit $vgpr1 + +... 
+ +--- +name: v_add_co_u32_e64__kernel_fi_offset0__other_vgpr_live_after +tracksRegLiveness: true +stack: + - { id: 0, size: 16, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr1 + ; MUBUFW64-LABEL: name: v_add_co_u32_e64__kernel_fi_offset0__other_vgpr_live_after + ; MUBUFW64: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 $vgpr1, 0, 0, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e64__kernel_fi_offset0__other_vgpr_live_after + ; FLATSCRW64: liveins: $vgpr1 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 $vgpr1, 0, 0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 renamable $vgpr1, %stack.0, 0, implicit $exec + SI_RETURN implicit $vgpr0, implicit $vgpr1 + +... 
+ +--- +name: v_add_co_u32_e64__kernel__other_vgpr_live_after__fi_offset0 +tracksRegLiveness: true +stack: + - { id: 0, size: 16, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr1 + ; MUBUFW64-LABEL: name: v_add_co_u32_e64__kernel__other_vgpr_live_after__fi_offset0 + ; MUBUFW64: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 0, $vgpr1, 0, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e64__kernel__other_vgpr_live_after__fi_offset0 + ; FLATSCRW64: liveins: $vgpr1 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 0, $vgpr1, 0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 %stack.0, renamable $vgpr1, 0, implicit $exec + SI_RETURN implicit $vgpr0, implicit $vgpr1 + +... 
+ +--- +name: v_add_co_u32_e32__identity_vgpr__fi_offset0__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + + ; MUBUFW64-LABEL: name: v_add_co_u32_e32__identity_vgpr__fi_offset0__kernel + ; MUBUFW64: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr0, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__identity_vgpr__fi_offset0__kernel + ; FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr0, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr0, %stack.0, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e32__fi_offset0__identity_vgpr__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + ; MUBUFW64-LABEL: name: v_add_co_u32_e32__fi_offset0__identity_vgpr__kernel + ; MUBUFW64: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 0, $vgpr0, implicit-def dead $vcc, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__fi_offset0__identity_vgpr__kernel + ; FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 0, $vgpr0, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 %stack.0, $vgpr0, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e64__identity_vgpr__fi_offset0__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + + ; MUBUFW64-LABEL: name: v_add_co_u32_e64__identity_vgpr__fi_offset0__kernel + ; MUBUFW64: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 $vgpr0, 0, 0, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e64__identity_vgpr__fi_offset0__kernel + ; FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 $vgpr0, 0, 0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0, renamable dead $vcc = V_ADD_CO_U32_e64 $vgpr0, %stack.0, 0, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e64__fi_offset0__identity_vgpr__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + + ; MUBUFW64-LABEL: name: v_add_co_u32_e64__fi_offset0__identity_vgpr__kernel + ; MUBUFW64: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 0, $vgpr0, 0, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e64__fi_offset0__identity_vgpr__kernel + ; FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 0, $vgpr0, 0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0, renamable dead $vcc = V_ADD_CO_U32_e64 %stack.0, $vgpr0, 0, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e32__fi_offset0__identity_vgpr__kernel_kill +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + ; MUBUFW64-LABEL: name: v_add_co_u32_e32__fi_offset0__identity_vgpr__kernel_kill + ; MUBUFW64: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 0, killed $vgpr0, implicit-def dead $vcc, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__fi_offset0__identity_vgpr__kernel_kill + ; FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 0, killed $vgpr0, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 %stack.0, killed $vgpr0, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e32__fi_offset0__identity_vgpr__kernel_live_vcc +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + ; MUBUFW64-LABEL: name: v_add_co_u32_e32__fi_offset0__identity_vgpr__kernel_live_vcc + ; MUBUFW64: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 0, $vgpr0, implicit-def $vcc, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__fi_offset0__identity_vgpr__kernel_live_vcc + ; FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 0, $vgpr0, implicit-def $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc + renamable $vgpr0 = V_ADD_CO_U32_e32 %stack.0, $vgpr0, implicit-def $vcc, implicit $exec + SI_RETURN implicit $vgpr0, implicit $vcc + +... 
+ +--- +name: v_add_co_u32_e32__identity_vgpr__fi_offset32__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } + - { id: 1, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + + ; MUBUFW64-LABEL: name: v_add_co_u32_e32__identity_vgpr__fi_offset32__kernel + ; MUBUFW64: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $vgpr1 = V_MOV_B32_e32 32, implicit $exec + ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr0, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__identity_vgpr__fi_offset32__kernel + ; FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: $vgpr1 = V_MOV_B32_e32 32, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr0, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr0, %stack.1, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ + +--- +name: v_add_co_u32_e32__identity_vgpr__fi_offset72__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 72, alignment: 16 } + - { id: 1, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + + ; MUBUFW64-LABEL: name: v_add_co_u32_e32__identity_vgpr__fi_offset72__kernel + ; MUBUFW64: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $vgpr1 = V_MOV_B32_e32 72, implicit $exec + ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr0, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__identity_vgpr__fi_offset72__kernel + ; FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: $vgpr1 = V_MOV_B32_e32 72, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr0, killed $vgpr1, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 $vgpr0, %stack.1, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e32__fi_offset72__identity_vgpr__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 72, alignment: 16 } + - { id: 1, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + + ; MUBUFW64-LABEL: name: v_add_co_u32_e32__fi_offset72__identity_vgpr__kernel + ; MUBUFW64: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 72, $vgpr0, implicit-def dead $vcc, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__fi_offset72__identity_vgpr__kernel + ; FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 72, $vgpr0, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 %stack.1, $vgpr0, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e32__fi_offset32__identity_vgpr__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } + - { id: 1, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + + ; MUBUFW64-LABEL: name: v_add_co_u32_e32__fi_offset32__identity_vgpr__kernel + ; MUBUFW64: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 32, $vgpr0, implicit-def dead $vcc, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e32__fi_offset32__identity_vgpr__kernel + ; FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 32, $vgpr0, implicit-def dead $vcc, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_CO_U32_e32 %stack.1, $vgpr0, implicit-def dead $vcc, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_co_u32_e64__identity_vgpr__fi_offset32__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } + - { id: 1, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + + ; MUBUFW64-LABEL: name: v_add_co_u32_e64__identity_vgpr__fi_offset32__kernel + ; MUBUFW64: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: {{ $}} + ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW64-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 $vgpr0, 32, 0, implicit $exec + ; MUBUFW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_co_u32_e64__identity_vgpr__fi_offset32__kernel + ; FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 $vgpr0, 32, 0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0, renamable dead $vcc = V_ADD_CO_U32_e64 $vgpr0, %stack.1, 0, implicit $exec + SI_RETURN implicit $vgpr0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir index 2d62d4238daa..af6823c1ab64 100644 --- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir +++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir @@ -467,3 +467,1280 @@ body: | SI_RETURN implicit $vgpr0 ... 
+ +--- +name: v_add_u32_e64__vgpr__fi_literal_offset +tracksRegLiveness: true +stack: + - { id: 0, size: 128, alignment: 16 } + - { id: 1, size: 4, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr8 + ; MUBUF-LABEL: name: v_add_u32_e64__vgpr__fi_literal_offset + ; MUBUF: liveins: $vgpr8 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; MUBUF-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr8, killed $vgpr1, 0, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e64__vgpr__fi_literal_offset + ; MUBUFW32: liveins: $vgpr8 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $vgpr1 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec + ; MUBUFW32-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr8, killed $vgpr1, 0, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e64__vgpr__fi_literal_offset + ; FLATSCRW64: liveins: $vgpr8 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr8, killed $sgpr4, 0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e64__vgpr__fi_literal_offset + ; FLATSCRW32: liveins: $vgpr8 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr8, killed $sgpr4, 0, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e64 $vgpr8, %stack.1, 0, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_u32_e64__vgpr__fi_literal_offset__clamp +tracksRegLiveness: true +stack: + - { id: 0, size: 128, alignment: 16 } + - { id: 1, size: 4, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr8 + ; MUBUF-LABEL: name: v_add_u32_e64__vgpr__fi_literal_offset__clamp + ; MUBUF: liveins: $vgpr8 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; MUBUF-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr8, killed $vgpr1, 1, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e64__vgpr__fi_literal_offset__clamp + ; MUBUFW32: liveins: $vgpr8 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $vgpr1 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec + ; MUBUFW32-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr8, killed $vgpr1, 1, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e64__vgpr__fi_literal_offset__clamp + ; FLATSCRW64: liveins: $vgpr8 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr8, killed $sgpr4, 1, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e64__vgpr__fi_literal_offset__clamp + ; FLATSCRW32: liveins: $vgpr8 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr8, killed $sgpr4, 1, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e64 $vgpr8, %stack.1, 1, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_u32_e64__fi_literal_offset__vgpr__clamp +tracksRegLiveness: true +stack: + - { id: 0, size: 128, alignment: 16 } + - { id: 1, size: 4, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr8 + ; MUBUF-LABEL: name: v_add_u32_e64__fi_literal_offset__vgpr__clamp + ; MUBUF: liveins: $vgpr8 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec + ; MUBUF-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e64 killed $vgpr1, $vgpr8, 1, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e64__fi_literal_offset__vgpr__clamp + ; MUBUFW32: liveins: $vgpr8 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $vgpr1 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec + ; MUBUFW32-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 killed $vgpr1, $vgpr8, 1, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e64__fi_literal_offset__vgpr__clamp + ; FLATSCRW64: liveins: $vgpr8 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e64 killed $sgpr4, $vgpr8, 1, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e64__fi_literal_offset__vgpr__clamp + ; FLATSCRW32: liveins: $vgpr8 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: $sgpr4 = S_ADD_I32 $sgpr32, 128, implicit-def $scc + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 killed $sgpr4, $vgpr8, 1, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e64 %stack.1, $vgpr8, 1, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_u32_e64__fi_literal_offset__vgpr__clamp__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 128, alignment: 16 } + - { id: 1, size: 4, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr8 + ; MUBUF-LABEL: name: v_add_u32_e64__fi_literal_offset__vgpr__clamp__kernel + ; MUBUF: liveins: $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $vgpr1 = V_MOV_B32_e32 128, implicit $exec + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e64 killed $vgpr1, $vgpr8, 1, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e64__fi_literal_offset__vgpr__clamp__kernel + ; MUBUFW32: liveins: $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 128, $vgpr8, 1, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e64__fi_literal_offset__vgpr__clamp__kernel + ; FLATSCRW64: liveins: $vgpr8 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: $sgpr4 = S_MOV_B32 128 + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e64 killed $sgpr4, $vgpr8, 1, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e64__fi_literal_offset__vgpr__clamp__kernel + ; FLATSCRW32: liveins: $vgpr8 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $vgpr0 = 
V_ADD_U32_e64 128, $vgpr8, 1, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e64 %stack.1, $vgpr8, 1, implicit $exec + SI_RETURN implicit $vgpr0 + +... + +--- +name: v_add_u32_e32__inline_imm__fi_offset0__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + ; MUBUF-LABEL: name: v_add_u32_e32__inline_imm__fi_offset0__kernel + ; MUBUF: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e32 12, killed $vgpr1, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e32__inline_imm__fi_offset0__kernel + ; MUBUFW32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 12, killed $vgpr1, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e32__inline_imm__fi_offset0__kernel + ; FLATSCRW64: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e32 12, killed $vgpr1, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e32__inline_imm__fi_offset0__kernel + ; FLATSCRW32: $vgpr1 = 
V_MOV_B32_e32 0, implicit $exec + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 12, killed $vgpr1, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e32 12, %stack.0, implicit $exec + SI_RETURN implicit $vgpr0 + +... + +--- +name: v_add_u32_e64__inline_imm__fi_offset0__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + ; MUBUF-LABEL: name: v_add_u32_e64__inline_imm__fi_offset0__kernel + ; MUBUF: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e64 12, 0, 0, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e64__inline_imm__fi_offset0__kernel + ; MUBUFW32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 12, 0, 0, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e64__inline_imm__fi_offset0__kernel + ; FLATSCRW64: renamable $vgpr0 = V_ADD_U32_e64 12, 0, 0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e64__inline_imm__fi_offset0__kernel + ; FLATSCRW32: renamable $vgpr0 = V_ADD_U32_e64 12, 0, 0, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e64 12, 
%stack.0, 0, implicit $exec + SI_RETURN implicit $vgpr0 + +... + + + +--- +name: v_add_u32_e32__inline_imm__fi_literal__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } + - { id: 1, size: 80, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + ; MUBUF-LABEL: name: v_add_u32_e32__inline_imm__fi_literal__kernel + ; MUBUF: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $vgpr1 = V_MOV_B32_e32 32, implicit $exec + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e32 12, killed $vgpr1, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e32__inline_imm__fi_literal__kernel + ; MUBUFW32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $vgpr1 = V_MOV_B32_e32 32, implicit $exec + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 12, killed $vgpr1, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e32__inline_imm__fi_literal__kernel + ; FLATSCRW64: $vgpr1 = V_MOV_B32_e32 32, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e32 12, killed $vgpr1, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e32__inline_imm__fi_literal__kernel + ; FLATSCRW32: $vgpr1 = V_MOV_B32_e32 32, implicit $exec + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 
12, killed $vgpr1, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e32 12, %stack.1, implicit $exec + SI_RETURN implicit $vgpr0 + +... + +--- +name: v_add_u32_e64__inline_imm__fi_literal__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } + - { id: 1, size: 80, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + ; MUBUF-LABEL: name: v_add_u32_e64__inline_imm__fi_literal__kernel + ; MUBUF: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e64 12, 32, 0, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e64__inline_imm__fi_literal__kernel + ; MUBUFW32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 12, 32, 0, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e64__inline_imm__fi_literal__kernel + ; FLATSCRW64: renamable $vgpr0 = V_ADD_U32_e64 12, 32, 0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e64__inline_imm__fi_literal__kernel + ; FLATSCRW32: renamable $vgpr0 = V_ADD_U32_e64 12, 32, 0, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e64 12, %stack.1, 0, implicit $exec + SI_RETURN 
implicit $vgpr0 + +... + +--- +name: v_add_u32_e64__fi_literal__inline_imm__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } + - { id: 1, size: 80, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + ; MUBUF-LABEL: name: v_add_u32_e64__fi_literal__inline_imm__kernel + ; MUBUF: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e64 32, 12, 0, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e64__fi_literal__inline_imm__kernel + ; MUBUFW32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 32, 12, 0, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e64__fi_literal__inline_imm__kernel + ; FLATSCRW64: renamable $vgpr0 = V_ADD_U32_e64 32, 12, 0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e64__fi_literal__inline_imm__kernel + ; FLATSCRW32: renamable $vgpr0 = V_ADD_U32_e64 32, 12, 0, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e64 %stack.1, 12, 0, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_u32_e64__inline_imm__fi_literal__kernel__clamp +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } + - { id: 1, size: 80, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + ; MUBUF-LABEL: name: v_add_u32_e64__inline_imm__fi_literal__kernel__clamp + ; MUBUF: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e64 12, 32, 1, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e64__inline_imm__fi_literal__kernel__clamp + ; MUBUFW32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 12, 32, 1, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e64__inline_imm__fi_literal__kernel__clamp + ; FLATSCRW64: renamable $vgpr0 = V_ADD_U32_e64 12, 32, 1, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e64__inline_imm__fi_literal__kernel__clamp + ; FLATSCRW32: renamable $vgpr0 = V_ADD_U32_e64 12, 32, 1, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e64 12, %stack.1, 1, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: killed_reg_regression +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } + - { id: 1, size: 80, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + + ; MUBUF-LABEL: name: killed_reg_regression + ; MUBUF: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: renamable $vgpr0 = V_LSHLREV_B32_e32 2, killed $vgpr0, implicit $exec + ; MUBUF-NEXT: renamable $vgpr1 = V_ADD_U32_e32 0, $vgpr0, implicit $exec + ; MUBUF-NEXT: renamable $vgpr2 = V_MOV_B32_e32 15, implicit $exec + ; MUBUF-NEXT: SCRATCH_STORE_DWORD killed renamable $vgpr2, killed renamable $vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32), addrspace 5) + ; MUBUF-NEXT: renamable $vgpr0 = V_SUB_U32_e32 0, killed $vgpr0, implicit $exec + ; MUBUF-NEXT: dead renamable $vgpr0 = SCRATCH_LOAD_DWORD killed renamable $vgpr0, 124, 0, implicit $exec, implicit $flat_scr :: (volatile load (s32), addrspace 5) + ; MUBUF-NEXT: S_ENDPGM 0 + ; + ; MUBUFW32-LABEL: name: killed_reg_regression + ; MUBUFW32: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $vgpr0 = V_LSHLREV_B32_e32 2, killed $vgpr0, implicit $exec + ; MUBUFW32-NEXT: renamable $vgpr1 = V_ADD_U32_e32 0, $vgpr0, implicit $exec + ; MUBUFW32-NEXT: renamable $vgpr2 = V_MOV_B32_e32 15, implicit $exec + ; MUBUFW32-NEXT: 
SCRATCH_STORE_DWORD killed renamable $vgpr2, killed renamable $vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32), addrspace 5) + ; MUBUFW32-NEXT: renamable $vgpr0 = V_SUB_U32_e32 0, killed $vgpr0, implicit $exec + ; MUBUFW32-NEXT: dead renamable $vgpr0 = SCRATCH_LOAD_DWORD killed renamable $vgpr0, 124, 0, implicit $exec, implicit $flat_scr :: (volatile load (s32), addrspace 5) + ; MUBUFW32-NEXT: S_ENDPGM 0 + ; + ; FLATSCRW64-LABEL: name: killed_reg_regression + ; FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_LSHLREV_B32_e32 2, killed $vgpr0, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr1 = V_ADD_U32_e32 0, $vgpr0, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr2 = V_MOV_B32_e32 15, implicit $exec + ; FLATSCRW64-NEXT: SCRATCH_STORE_DWORD killed renamable $vgpr2, killed renamable $vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32), addrspace 5) + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_SUB_U32_e32 0, killed $vgpr0, implicit $exec + ; FLATSCRW64-NEXT: dead renamable $vgpr0 = SCRATCH_LOAD_DWORD killed renamable $vgpr0, 124, 0, implicit $exec, implicit $flat_scr :: (volatile load (s32), addrspace 5) + ; FLATSCRW64-NEXT: S_ENDPGM 0 + ; + ; FLATSCRW32-LABEL: name: killed_reg_regression + ; FLATSCRW32: liveins: $vgpr0 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_LSHLREV_B32_e32 2, killed $vgpr0, implicit $exec + ; FLATSCRW32-NEXT: renamable $vgpr1 = V_ADD_U32_e32 0, $vgpr0, implicit $exec + ; FLATSCRW32-NEXT: renamable $vgpr2 = V_MOV_B32_e32 15, implicit $exec + ; FLATSCRW32-NEXT: SCRATCH_STORE_DWORD killed renamable $vgpr2, killed renamable $vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32), addrspace 5) + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_SUB_U32_e32 0, killed $vgpr0, implicit $exec + ; FLATSCRW32-NEXT: dead renamable $vgpr0 = SCRATCH_LOAD_DWORD killed renamable $vgpr0, 124, 0, implicit $exec, implicit $flat_scr 
:: (volatile load (s32), addrspace 5) + ; FLATSCRW32-NEXT: S_ENDPGM 0 + renamable $vgpr0 = V_LSHLREV_B32_e32 2, killed $vgpr0, implicit $exec + renamable $vgpr1 = V_ADD_U32_e32 %stack.0, $vgpr0, implicit $exec + renamable $vgpr2 = V_MOV_B32_e32 15, implicit $exec + SCRATCH_STORE_DWORD killed renamable $vgpr2, killed renamable $vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store (s32), addrspace 5) + renamable $vgpr0 = V_SUB_U32_e32 %stack.0, killed $vgpr0, implicit $exec + dead renamable $vgpr0 = SCRATCH_LOAD_DWORD killed renamable $vgpr0, 124, 0, implicit $exec, implicit $flat_scr :: (volatile load (s32), addrspace 5) + S_ENDPGM 0 + +... + +--- +name: v_add_u32_e32__kernel_fi_offset0__other_vgpr_live_after +tracksRegLiveness: true +stack: + - { id: 0, size: 16, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr1 + ; MUBUF-LABEL: name: v_add_u32_e32__kernel_fi_offset0__other_vgpr_live_after + ; MUBUF: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $vgpr1, killed $vgpr2, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e32__kernel_fi_offset0__other_vgpr_live_after + ; MUBUFW32: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; 
MUBUFW32-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $vgpr1, killed $vgpr2, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e32__kernel_fi_offset0__other_vgpr_live_after + ; FLATSCRW64: liveins: $vgpr1 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $vgpr1, killed $vgpr2, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e32__kernel_fi_offset0__other_vgpr_live_after + ; FLATSCRW32: liveins: $vgpr1 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $vgpr1, killed $vgpr2, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + renamable $vgpr0 = V_ADD_U32_e32 renamable $vgpr1, %stack.0, implicit $exec + SI_RETURN implicit $vgpr0, implicit $vgpr1 + +... 
+ +--- +name: v_add_u32_e32__kernel_other_vgpr_live_after__fi_offset0 +tracksRegLiveness: true +stack: + - { id: 0, size: 16, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr1 + ; MUBUF-LABEL: name: v_add_u32_e32__kernel_other_vgpr_live_after__fi_offset0 + ; MUBUF: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e32 0, $vgpr1, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e32__kernel_other_vgpr_live_after__fi_offset0 + ; MUBUFW32: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 0, $vgpr1, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e32__kernel_other_vgpr_live_after__fi_offset0 + ; FLATSCRW64: liveins: $vgpr1 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e32 0, $vgpr1, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e32__kernel_other_vgpr_live_after__fi_offset0 + ; FLATSCRW32: liveins: $vgpr1 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 0, $vgpr1, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + renamable $vgpr0 = 
V_ADD_U32_e32 %stack.0, renamable $vgpr1, implicit $exec + SI_RETURN implicit $vgpr0, implicit $vgpr1 + +... + +--- +name: v_add_u32_e32__kernel_fi_offset0__sgpr_live_after +tracksRegLiveness: true +stack: + - { id: 0, size: 16, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $sgpr8 + ; MUBUF-LABEL: name: v_add_u32_e32__kernel_fi_offset0__sgpr_live_after + ; MUBUF: liveins: $sgpr8, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $sgpr8, killed $vgpr1, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e32__kernel_fi_offset0__sgpr_live_after + ; MUBUFW32: liveins: $sgpr8, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $sgpr8, killed $vgpr1, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e32__kernel_fi_offset0__sgpr_live_after + ; FLATSCRW64: liveins: $sgpr8 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $sgpr8, killed $vgpr1, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8 + ; + ; 
FLATSCRW32-LABEL: name: v_add_u32_e32__kernel_fi_offset0__sgpr_live_after + ; FLATSCRW32: liveins: $sgpr8 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $sgpr8, killed $vgpr1, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8 + renamable $vgpr0 = V_ADD_U32_e32 renamable $sgpr8, %stack.0, implicit $exec + SI_RETURN implicit $vgpr0, implicit $sgpr8 + +... + +--- +name: v_add_u32_e64__kernel_fi_offset0__other_vgpr_live_after +tracksRegLiveness: true +stack: + - { id: 0, size: 16, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr1 + ; MUBUF-LABEL: name: v_add_u32_e64__kernel_fi_offset0__other_vgpr_live_after + ; MUBUF: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr1, 0, 0, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e64__kernel_fi_offset0__other_vgpr_live_after + ; MUBUFW32: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr1, 0, 0, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e64__kernel_fi_offset0__other_vgpr_live_after + ; FLATSCRW64: liveins: 
$vgpr1 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr1, 0, 0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e64__kernel_fi_offset0__other_vgpr_live_after + ; FLATSCRW32: liveins: $vgpr1 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr1, 0, 0, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + renamable $vgpr0 = V_ADD_U32_e64 renamable $vgpr1, %stack.0, 0, implicit $exec + SI_RETURN implicit $vgpr0, implicit $vgpr1 + +... + +--- +name: v_add_u32_e32__kernel_fi_offset72__sgpr_live_after +tracksRegLiveness: true +stack: + - { id: 0, size: 72, alignment: 16 } + - { id: 1, size: 32, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $sgpr8 + ; MUBUF-LABEL: name: v_add_u32_e32__kernel_fi_offset72__sgpr_live_after + ; MUBUF: liveins: $sgpr8, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $vgpr1 = V_MOV_B32_e32 72, implicit $exec + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $sgpr8, killed $vgpr1, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e32__kernel_fi_offset72__sgpr_live_after + ; MUBUFW32: liveins: $sgpr8, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $vgpr1 = V_MOV_B32_e32 
72, implicit $exec + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $sgpr8, killed $vgpr1, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e32__kernel_fi_offset72__sgpr_live_after + ; FLATSCRW64: liveins: $sgpr8 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: $vgpr1 = V_MOV_B32_e32 72, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $sgpr8, killed $vgpr1, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e32__kernel_fi_offset72__sgpr_live_after + ; FLATSCRW32: liveins: $sgpr8 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: $vgpr1 = V_MOV_B32_e32 72, implicit $exec + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $sgpr8, killed $vgpr1, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8 + renamable $vgpr0 = V_ADD_U32_e32 renamable $sgpr8, %stack.1, implicit $exec + SI_RETURN implicit $vgpr0, implicit $sgpr8 + +... 
+ +--- +name: v_add_u32_e64__kernel_fi_offset72__sgpr_live_after +tracksRegLiveness: true +stack: + - { id: 0, size: 72, alignment: 16 } + - { id: 1, size: 32, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $sgpr8 + ; MUBUF-LABEL: name: v_add_u32_e64__kernel_fi_offset72__sgpr_live_after + ; MUBUF: liveins: $sgpr8, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $vgpr1 = V_MOV_B32_e32 72, implicit $exec + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $sgpr8, killed $vgpr1, 0, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e64__kernel_fi_offset72__sgpr_live_after + ; MUBUFW32: liveins: $sgpr8, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $sgpr8, 72, 0, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e64__kernel_fi_offset72__sgpr_live_after + ; FLATSCRW64: liveins: $sgpr8 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: $vgpr1 = V_MOV_B32_e32 72, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $sgpr8, killed $vgpr1, 0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e64__kernel_fi_offset72__sgpr_live_after + ; FLATSCRW32: liveins: $sgpr8 + ; FLATSCRW32-NEXT: {{ 
$}} + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $sgpr8, 72, 0, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8 + renamable $vgpr0 = V_ADD_U32_e64 renamable $sgpr8, %stack.1, 0, implicit $exec + SI_RETURN implicit $vgpr0, implicit $sgpr8 + +... + +--- +name: v_add_u32_e32__identity_vgpr__fi_offset0__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + + ; MUBUF-LABEL: name: v_add_u32_e32__identity_vgpr__fi_offset0__kernel + ; MUBUF: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $vgpr0, killed $vgpr1, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e32__identity_vgpr__fi_offset0__kernel + ; MUBUFW32: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $vgpr0, killed $vgpr1, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e32__identity_vgpr__fi_offset0__kernel + ; FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; FLATSCRW64-NEXT: 
renamable $vgpr0 = V_ADD_U32_e32 $vgpr0, killed $vgpr1, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e32__identity_vgpr__fi_offset0__kernel + ; FLATSCRW32: liveins: $vgpr0 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $vgpr0, killed $vgpr1, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e32 $vgpr0, %stack.0, implicit $exec + SI_RETURN implicit $vgpr0 + +... + +--- +name: v_add_u32_e32__fi_offset0__identity_vgpr__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + ; MUBUF-LABEL: name: v_add_u32_e32__fi_offset0__identity_vgpr__kernel + ; MUBUF: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e32 0, $vgpr0, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e32__fi_offset0__identity_vgpr__kernel + ; MUBUFW32: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 0, $vgpr0, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e32__fi_offset0__identity_vgpr__kernel + ; FLATSCRW64: 
liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e32 0, $vgpr0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e32__fi_offset0__identity_vgpr__kernel + ; FLATSCRW32: liveins: $vgpr0 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 0, $vgpr0, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e32 %stack.0, $vgpr0, implicit $exec + SI_RETURN implicit $vgpr0 + +... + +--- +name: v_add_u32_e64__identity_vgpr__fi_offset0__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + + ; MUBUF-LABEL: name: v_add_u32_e64__identity_vgpr__fi_offset0__kernel + ; MUBUF: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, 0, 0, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e64__identity_vgpr__fi_offset0__kernel + ; MUBUFW32: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, 0, 0, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e64__identity_vgpr__fi_offset0__kernel + ; FLATSCRW64: liveins: 
$vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, 0, 0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e64__identity_vgpr__fi_offset0__kernel + ; FLATSCRW32: liveins: $vgpr0 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, 0, 0, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, %stack.0, 0, implicit $exec + SI_RETURN implicit $vgpr0 + +... + +--- +name: v_add_u32_e64__fi_offset0__identity_vgpr__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + + ; MUBUF-LABEL: name: v_add_u32_e64__fi_offset0__identity_vgpr__kernel + ; MUBUF: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e64 0, $vgpr0, 0, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e64__fi_offset0__identity_vgpr__kernel + ; MUBUFW32: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 0, $vgpr0, 0, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e64__fi_offset0__identity_vgpr__kernel + ; FLATSCRW64: liveins: 
$vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e64 0, $vgpr0, 0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e64__fi_offset0__identity_vgpr__kernel + ; FLATSCRW32: liveins: $vgpr0 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 0, $vgpr0, 0, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e64 %stack.0, $vgpr0, 0, implicit $exec + SI_RETURN implicit $vgpr0 + +... + +--- +name: v_add_u32_e32__fi_offset0__identity_vgpr__kernel_kill +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + ; MUBUF-LABEL: name: v_add_u32_e32__fi_offset0__identity_vgpr__kernel_kill + ; MUBUF: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e32 0, killed $vgpr0, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e32__fi_offset0__identity_vgpr__kernel_kill + ; MUBUFW32: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 0, killed $vgpr0, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e32__fi_offset0__identity_vgpr__kernel_kill + ; 
FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e32 0, killed $vgpr0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e32__fi_offset0__identity_vgpr__kernel_kill + ; FLATSCRW32: liveins: $vgpr0 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 0, killed $vgpr0, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e32 %stack.0, killed $vgpr0, implicit $exec + SI_RETURN implicit $vgpr0 + +... + +--- +name: v_add_u32_e32__identity_vgpr__fi_offset32__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } + - { id: 1, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + + ; MUBUF-LABEL: name: v_add_u32_e32__identity_vgpr__fi_offset32__kernel + ; MUBUF: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $vgpr1 = V_MOV_B32_e32 32, implicit $exec + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $vgpr0, killed $vgpr1, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e32__identity_vgpr__fi_offset32__kernel + ; MUBUFW32: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $vgpr1 = V_MOV_B32_e32 32, implicit $exec + ; MUBUFW32-NEXT: renamable 
$vgpr0 = V_ADD_U32_e32 $vgpr0, killed $vgpr1, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e32__identity_vgpr__fi_offset32__kernel + ; FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: $vgpr1 = V_MOV_B32_e32 32, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $vgpr0, killed $vgpr1, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e32__identity_vgpr__fi_offset32__kernel + ; FLATSCRW32: liveins: $vgpr0 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: $vgpr1 = V_MOV_B32_e32 32, implicit $exec + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $vgpr0, killed $vgpr1, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e32 $vgpr0, %stack.1, implicit $exec + SI_RETURN implicit $vgpr0 + +... + + +--- +name: v_add_u32_e32__identity_vgpr__fi_offset72__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 72, alignment: 16 } + - { id: 1, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + + ; MUBUF-LABEL: name: v_add_u32_e32__identity_vgpr__fi_offset72__kernel + ; MUBUF: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $vgpr1 = V_MOV_B32_e32 72, implicit $exec + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $vgpr0, killed $vgpr1, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e32__identity_vgpr__fi_offset72__kernel + ; MUBUFW32: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; 
MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $vgpr1 = V_MOV_B32_e32 72, implicit $exec + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $vgpr0, killed $vgpr1, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e32__identity_vgpr__fi_offset72__kernel + ; FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: $vgpr1 = V_MOV_B32_e32 72, implicit $exec + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $vgpr0, killed $vgpr1, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e32__identity_vgpr__fi_offset72__kernel + ; FLATSCRW32: liveins: $vgpr0 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: $vgpr1 = V_MOV_B32_e32 72, implicit $exec + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 $vgpr0, killed $vgpr1, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e32 $vgpr0, %stack.1, implicit $exec + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: v_add_u32_e32__fi_offset72__identity_vgpr__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 72, alignment: 16 } + - { id: 1, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + + ; MUBUF-LABEL: name: v_add_u32_e32__fi_offset72__identity_vgpr__kernel + ; MUBUF: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e32 72, $vgpr0, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e32__fi_offset72__identity_vgpr__kernel + ; MUBUFW32: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 72, $vgpr0, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e32__fi_offset72__identity_vgpr__kernel + ; FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e32 72, $vgpr0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e32__fi_offset72__identity_vgpr__kernel + ; FLATSCRW32: liveins: $vgpr0 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 72, $vgpr0, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e32 %stack.1, $vgpr0, implicit $exec + 
SI_RETURN implicit $vgpr0 + +... + +--- +name: v_add_u32_e32__fi_offset32__identity_vgpr__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } + - { id: 1, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + + ; MUBUF-LABEL: name: v_add_u32_e32__fi_offset32__identity_vgpr__kernel + ; MUBUF: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e32 32, $vgpr0, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e32__fi_offset32__identity_vgpr__kernel + ; MUBUFW32: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 32, $vgpr0, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e32__fi_offset32__identity_vgpr__kernel + ; FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e32 32, $vgpr0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e32__fi_offset32__identity_vgpr__kernel + ; FLATSCRW32: liveins: $vgpr0 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e32 32, $vgpr0, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit $vgpr0 + renamable $vgpr0 = V_ADD_U32_e32 
%stack.1, $vgpr0, implicit $exec + SI_RETURN implicit $vgpr0 + +... + +--- +name: v_add_u32_e64__identity_vgpr__fi_offset32__kernel +tracksRegLiveness: true +stack: + - { id: 0, size: 32, alignment: 16 } + - { id: 1, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 + + ; MUBUF-LABEL: name: v_add_u32_e64__identity_vgpr__fi_offset32__kernel + ; MUBUF: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, 32, 0, implicit $exec + ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 + ; + ; MUBUFW32-LABEL: name: v_add_u32_e64__identity_vgpr__fi_offset32__kernel + ; MUBUFW32: liveins: $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: {{ $}} + ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, 32, 0, implicit $exec + ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW64-LABEL: name: v_add_u32_e64__identity_vgpr__fi_offset32__kernel + ; FLATSCRW64: liveins: $vgpr0 + ; FLATSCRW64-NEXT: {{ $}} + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, 32, 0, implicit $exec + ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 + ; + ; FLATSCRW32-LABEL: name: v_add_u32_e64__identity_vgpr__fi_offset32__kernel + ; FLATSCRW32: liveins: $vgpr0 + ; FLATSCRW32-NEXT: {{ $}} + ; FLATSCRW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, 32, 0, implicit $exec + ; FLATSCRW32-NEXT: SI_RETURN implicit 
$vgpr0 + renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, %stack.1, 0, implicit $exec + SI_RETURN implicit $vgpr0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll new file mode 100644 index 000000000000..f419d89a7f0a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s -o - | FileCheck %s + +@g_fn = addrspace(1) global ptr null + +;. +; CHECK: @g_fn = addrspace(1) global ptr null +;. +define void @set_fn(ptr %fn) { +; CHECK-LABEL: define {{[^@]+}}@set_fn +; CHECK-SAME: (ptr [[FN:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: store ptr [[FN]], ptr addrspace(1) @g_fn, align 8 +; CHECK-NEXT: ret void +; +entry: + store ptr %fn, ptr addrspace(1) @g_fn + ret void +} + +define void @get_fn(ptr %fn) { +; CHECK-LABEL: define {{[^@]+}}@get_fn +; CHECK-SAME: (ptr [[FN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(1) @g_fn, align 8 +; CHECK-NEXT: store ptr [[LOAD]], ptr [[FN]], align 8 +; CHECK-NEXT: ret void +; +entry: + %load = load ptr, ptr addrspace(1) @g_fn + store ptr %load, ptr %fn + ret void +} + +define void @foo() { +; CHECK-LABEL: define {{[^@]+}}@foo +; CHECK-SAME: () #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[FN:%.*]] = alloca ptr, align 8, addrspace(5) +; CHECK-NEXT: store ptr null, ptr addrspace(5) [[FN]], align 8 +; CHECK-NEXT: [[FN_CAST:%.*]] = addrspacecast ptr addrspace(5) [[FN]] to ptr +; CHECK-NEXT: call void @get_fn(ptr [[FN_CAST]]) +; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(5) [[FN]], align 8 +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne ptr [[LOAD]], null +; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] 
+; CHECK: if.then: +; CHECK-NEXT: [[LOAD_1:%.*]] = load ptr, ptr addrspace(5) [[FN]], align 8 +; CHECK-NEXT: call void [[LOAD_1]]() +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: ret void +; +entry: + %fn = alloca ptr, addrspace(5) + store ptr null, ptr addrspace(5) %fn + %fn.cast = addrspacecast ptr addrspace(5) %fn to ptr + call void @get_fn(ptr %fn.cast) + %load = load ptr, ptr addrspace(5) %fn + %tobool = icmp ne ptr %load, null + br i1 %tobool, label %if.then, label %if.end + +if.then: + %load.1 = load ptr, ptr addrspace(5) %fn + call void %load.1() + br label %if.end + +if.end: + ret void +} +;. +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +;. diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 15f23eda241b..ee7f9375bf5d 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -43,7 +43,6 @@ ; GCN-O0-NEXT: FunctionPass Manager ; GCN-O0-NEXT: Expand Atomic instructions ; GCN-O0-NEXT: Remove unreachable blocks from the CFG -; GCN-O0-NEXT: Expand vector predication intrinsics ; GCN-O0-NEXT: Instrument function entry/exit with calls to e.g. 
mcount() (post inlining) ; GCN-O0-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O0-NEXT: Expand reduction intrinsics @@ -222,7 +221,6 @@ ; GCN-O1-NEXT: Constant Hoisting ; GCN-O1-NEXT: Replace intrinsics with calls to vector library ; GCN-O1-NEXT: Partially inline calls to library functions -; GCN-O1-NEXT: Expand vector predication intrinsics ; GCN-O1-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining) ; GCN-O1-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O1-NEXT: Expand reduction intrinsics @@ -508,7 +506,6 @@ ; GCN-O1-OPTS-NEXT: Constant Hoisting ; GCN-O1-OPTS-NEXT: Replace intrinsics with calls to vector library ; GCN-O1-OPTS-NEXT: Partially inline calls to library functions -; GCN-O1-OPTS-NEXT: Expand vector predication intrinsics ; GCN-O1-OPTS-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining) ; GCN-O1-OPTS-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O1-OPTS-NEXT: Expand reduction intrinsics @@ -813,7 +810,6 @@ ; GCN-O2-NEXT: Constant Hoisting ; GCN-O2-NEXT: Replace intrinsics with calls to vector library ; GCN-O2-NEXT: Partially inline calls to library functions -; GCN-O2-NEXT: Expand vector predication intrinsics ; GCN-O2-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining) ; GCN-O2-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O2-NEXT: Expand reduction intrinsics @@ -1126,7 +1122,6 @@ ; GCN-O3-NEXT: Constant Hoisting ; GCN-O3-NEXT: Replace intrinsics with calls to vector library ; GCN-O3-NEXT: Partially inline calls to library functions -; GCN-O3-NEXT: Expand vector predication intrinsics ; GCN-O3-NEXT: Instrument function entry/exit with calls to e.g. 
mcount() (post inlining) ; GCN-O3-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O3-NEXT: Expand reduction intrinsics diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll index 074489b9ff50..d085b3c768a8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll @@ -523,14 +523,23 @@ define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) { ; GFX67-NEXT: exp mrt0 v0, v1, v0, v0 done vm ; GFX67-NEXT: s_endpgm ; -; GFX8910-LABEL: s_buffer_load_imm_mergex2: -; GFX8910: ; %bb.0: ; %main_body -; GFX8910-NEXT: s_buffer_load_dwordx2 s[0:1], s[0:3], 0x4 -; GFX8910-NEXT: s_waitcnt lgkmcnt(0) -; GFX8910-NEXT: v_mov_b32_e32 v0, s0 -; GFX8910-NEXT: v_mov_b32_e32 v1, s1 -; GFX8910-NEXT: exp mrt0 v0, v1, v0, v0 done vm -; GFX8910-NEXT: s_endpgm +; GFX8-LABEL: s_buffer_load_imm_mergex2: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: s_buffer_load_dwordx2 s[0:1], s[0:3], 0x4 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: exp mrt0 v0, v1, v0, v0 done vm +; GFX8-NEXT: s_endpgm +; +; GFX910-LABEL: s_buffer_load_imm_mergex2: +; GFX910: ; %bb.0: ; %main_body +; GFX910-NEXT: s_buffer_load_dwordx2 s[4:5], s[0:3], 0x4 +; GFX910-NEXT: s_waitcnt lgkmcnt(0) +; GFX910-NEXT: v_mov_b32_e32 v0, s4 +; GFX910-NEXT: v_mov_b32_e32 v1, s5 +; GFX910-NEXT: exp mrt0 v0, v1, v0, v0 done vm +; GFX910-NEXT: s_endpgm ; ; GFX11-LABEL: s_buffer_load_imm_mergex2: ; GFX11: ; %bb.0: ; %main_body @@ -570,16 +579,27 @@ define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) { ; GFX67-NEXT: exp mrt0 v0, v1, v2, v3 done vm ; GFX67-NEXT: s_endpgm ; -; GFX8910-LABEL: s_buffer_load_imm_mergex4: -; GFX8910: ; %bb.0: ; %main_body -; GFX8910-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x8 -; GFX8910-NEXT: s_waitcnt lgkmcnt(0) -; GFX8910-NEXT: v_mov_b32_e32 v0, s0 -; GFX8910-NEXT: 
v_mov_b32_e32 v1, s1 -; GFX8910-NEXT: v_mov_b32_e32 v2, s2 -; GFX8910-NEXT: v_mov_b32_e32 v3, s3 -; GFX8910-NEXT: exp mrt0 v0, v1, v2, v3 done vm -; GFX8910-NEXT: s_endpgm +; GFX8-LABEL: s_buffer_load_imm_mergex4: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x8 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: exp mrt0 v0, v1, v2, v3 done vm +; GFX8-NEXT: s_endpgm +; +; GFX910-LABEL: s_buffer_load_imm_mergex4: +; GFX910: ; %bb.0: ; %main_body +; GFX910-NEXT: s_buffer_load_dwordx4 s[4:7], s[0:3], 0x8 +; GFX910-NEXT: s_waitcnt lgkmcnt(0) +; GFX910-NEXT: v_mov_b32_e32 v0, s4 +; GFX910-NEXT: v_mov_b32_e32 v1, s5 +; GFX910-NEXT: v_mov_b32_e32 v2, s6 +; GFX910-NEXT: v_mov_b32_e32 v3, s7 +; GFX910-NEXT: exp mrt0 v0, v1, v2, v3 done vm +; GFX910-NEXT: s_endpgm ; ; GFX11-LABEL: s_buffer_load_imm_mergex4: ; GFX11: ; %bb.0: ; %main_body diff --git a/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir b/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir index 1b2f672fd57b..02c1a328f482 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir @@ -1,14 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX10 # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s -check-prefixes=CHECK,GFX12 -# CHECK-LABEL: name: merge_s_buffer_load_x2 -# CHECK: S_BUFFER_LOAD_DWORDX2_IMM %0, 0, 0 :: (dereferenceable invariant load (s64), align 4) +--- name: merge_s_buffer_load_x2 tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-LABEL: name: merge_s_buffer_load_x2 + ; GFX10: liveins: 
$sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: early-clobber %3:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY %3.sub0 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub1 + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: merge_s_buffer_load_x2 + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_BUFFER_LOAD_DWORDX2_IMM]].sub0 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX2_IMM]].sub1 + ; GFX12-NEXT: S_ENDPGM 0 %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32)) @@ -17,15 +34,19 @@ body: | ... 
--- -# CHECK-LABEL: name: merge_s_buffer_load_x1_x2 -# CHECK: S_BUFFER_LOAD_DWORD_IMM %0, 0, 0 :: (dereferenceable invariant load (s32)) -# CHECK: S_BUFFER_LOAD_DWORDX2_IMM %0, 4, 0 :: (dereferenceable invariant load (s64)) name: merge_s_buffer_load_x1_x2 tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-LABEL: name: merge_s_buffer_load_x1_x2 + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX2_IMM:%[0-9]+]]:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM [[COPY]], 4, 0 :: (dereferenceable invariant load (s64)) + ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s64)) @@ -34,16 +55,28 @@ body: | ... 
--- -# CHECK-LABEL: name: merge_s_buffer_load_x2_x1 -# GFX10: S_BUFFER_LOAD_DWORDX2_IMM %0, 0, 0 :: (dereferenceable invariant load (s64)) -# GFX10: S_BUFFER_LOAD_DWORD_IMM %0, 8, 0 :: (dereferenceable invariant load (s32)) -# GFX12: S_BUFFER_LOAD_DWORDX3_IMM %0, 0, 0 :: (dereferenceable invariant load (s96), align 8) name: merge_s_buffer_load_x2_x1 tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-LABEL: name: merge_s_buffer_load_x2_x1 + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: [[S_BUFFER_LOAD_DWORDX2_IMM:%[0-9]+]]:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s64)) + ; GFX10-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY]], 8, 0 :: (dereferenceable invariant load (s32)) + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: merge_s_buffer_load_x2_x1 + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_BUFFER_LOAD_DWORDX3_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s96), align 8) + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY [[S_BUFFER_LOAD_DWORDX3_IMM]].sub0_sub1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX3_IMM]].sub2 + ; GFX12-NEXT: S_ENDPGM 0 %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s64)) %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s32)) @@ -52,14 +85,37 @@ body: | ... 
--- -# CHECK-LABEL: name: merge_s_buffer_load_x4 -# CHECK: S_BUFFER_LOAD_DWORDX4_IMM %0, 0, 0 :: (dereferenceable invariant load (s128), align 4) name: merge_s_buffer_load_x4 tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-LABEL: name: merge_s_buffer_load_x4 + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: early-clobber %7:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY %7.sub0_sub1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY killed %7.sub2_sub3 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY1]].sub0 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY1]].sub1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY2]].sub1 + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: merge_s_buffer_load_x4 + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_96 = COPY [[S_BUFFER_LOAD_DWORDX4_IMM]].sub0_sub1_sub2 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX4_IMM]].sub3 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY1]].sub0_sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY1]].sub2 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1 + ; GFX12-NEXT: S_ENDPGM 0 %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 
%1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32)) @@ -70,15 +126,19 @@ body: | ... --- -# CHECK-LABEL: name: merge_s_buffer_load_x1_x3 -# CHECK: S_BUFFER_LOAD_DWORD_IMM %0, 0, 0 :: (dereferenceable invariant load (s32)) -# CHECK: S_BUFFER_LOAD_DWORDX3_IMM %0, 4, 0 :: (dereferenceable invariant load (s96), align 16) name: merge_s_buffer_load_x1_x3 tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-LABEL: name: merge_s_buffer_load_x1_x3 + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_BUFFER_LOAD_DWORDX3_IMM [[COPY]], 4, 0 :: (dereferenceable invariant load (s96), align 16) + ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) %2:sgpr_96 = S_BUFFER_LOAD_DWORDX3_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s96)) @@ -87,14 +147,20 @@ body: | ... 
--- -# CHECK-LABEL: name: merge_s_buffer_load_x3_x1 -# CHECK: S_BUFFER_LOAD_DWORDX4_IMM %0, 0, 0 :: (dereferenceable invariant load (s128)) name: merge_s_buffer_load_x3_x1 tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-LABEL: name: merge_s_buffer_load_x3_x1 + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s128)) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_96 = COPY [[S_BUFFER_LOAD_DWORDX4_IMM]].sub0_sub1_sub2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX4_IMM]].sub3 + ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:sgpr_96 = S_BUFFER_LOAD_DWORDX3_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s96)) %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 12, 0 :: (dereferenceable invariant load (s32)) @@ -103,14 +169,53 @@ body: | ... 
--- -# CHECK-LABEL: name: merge_s_buffer_load_x8 -# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 4) name: merge_s_buffer_load_x8 tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-LABEL: name: merge_s_buffer_load_x8 + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: early-clobber %15:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 4) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %15.sub0_sub1_sub2_sub3 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %15.sub4_sub5_sub6_sub7 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY1]].sub0_sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY1]].sub2_sub3 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY4]].sub0 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY4]].sub1 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY2]].sub2_sub3 + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY9]].sub0 + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY9]].sub1 + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY10]].sub0 + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY10]].sub1 + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: merge_s_buffer_load_x8 + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: 
(dereferenceable invariant load (s256), align 4) + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_96 = COPY [[COPY1]].sub0_sub1_sub2 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY1]].sub3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_64_xexec = COPY [[COPY3]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub2 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY5]].sub0 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY5]].sub1 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sgpr_96 = COPY [[COPY2]].sub0_sub1_sub2 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY2]].sub3 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sreg_64_xexec = COPY [[COPY9]].sub0_sub1 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY9]].sub2 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY11]].sub0 + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY11]].sub1 + ; GFX12-NEXT: S_ENDPGM 0 %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32)) @@ -125,14 +230,53 @@ body: | ... 
--- -# CHECK-LABEL: name: merge_s_buffer_load_x8_reordered -# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 4) name: merge_s_buffer_load_x8_reordered tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-LABEL: name: merge_s_buffer_load_x8_reordered + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: early-clobber %15:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 4) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %15.sub4_sub5_sub6_sub7 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %15.sub0_sub1_sub2_sub3 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY1]].sub0_sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY1]].sub2_sub3 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub1 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub0 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY2]].sub2_sub3 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY7]].sub1 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY7]].sub0 + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY4]].sub1 + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY4]].sub0 + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY8]].sub1 + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY8]].sub0 + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: merge_s_buffer_load_x8_reordered + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = 
S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 4) + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_96 = COPY [[COPY1]].sub0_sub1_sub2 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY1]].sub3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_64_xexec = COPY [[COPY3]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub2 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY5]].sub1 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY5]].sub0 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sgpr_96 = COPY [[COPY2]].sub0_sub1_sub2 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY2]].sub3 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sreg_64_xexec = COPY [[COPY9]].sub0_sub1 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY9]].sub2 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY11]].sub1 + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY11]].sub0 + ; GFX12-NEXT: S_ENDPGM 0 %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 20, 0 :: (dereferenceable invariant load (s32)) %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32)) @@ -147,14 +291,37 @@ body: | ... 
--- -# CHECK-LABEL: name: merge_s_buffer_load_x8_out_of_x2 -# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 8) name: merge_s_buffer_load_x8_out_of_x2 tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-LABEL: name: merge_s_buffer_load_x8_out_of_x2 + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: early-clobber %7:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %7.sub4_sub5_sub6_sub7 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %7.sub0_sub1_sub2_sub3 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1 + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: merge_s_buffer_load_x8_out_of_x2 + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8) + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1 + ; GFX12-NEXT: S_ENDPGM 0 %0:sgpr_128 = COPY 
$sgpr0_sgpr1_sgpr2_sgpr3 %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s64)) %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s64)) @@ -165,14 +332,29 @@ body: | ... --- -# CHECK-LABEL: name: merge_s_buffer_load_x8_out_of_x4 -# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 16) name: merge_s_buffer_load_x8_out_of_x4 tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-LABEL: name: merge_s_buffer_load_x8_out_of_x4 + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: early-clobber %3:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7 + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: merge_s_buffer_load_x8_out_of_x4 + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX12-NEXT: S_ENDPGM 0 %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128)) %2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s128)) @@ -181,14 +363,37 @@ body: | ... 
--- -# CHECK-LABEL: name: merge_s_buffer_load_x8_mixed -# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 16) name: merge_s_buffer_load_x8_mixed tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-LABEL: name: merge_s_buffer_load_x8_mixed + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: early-clobber %7:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub2_sub3 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1 + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: merge_s_buffer_load_x8_mixed + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub2_sub3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1 + ; GFX12-NEXT: S_ENDPGM 0 %0:sgpr_128 = COPY 
$sgpr0_sgpr1_sgpr2_sgpr3 %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128)) %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) @@ -199,14 +404,39 @@ body: | ... --- -# CHECK-LABEL: name: merge_s_buffer_load_sgpr_imm -# CHECK: S_BUFFER_LOAD_DWORDX4_SGPR_IMM %0, %1, 0, 0 :: (dereferenceable invariant load (s128), align 4) name: merge_s_buffer_load_sgpr_imm tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4 + ; GFX10-LABEL: name: merge_s_buffer_load_sgpr_imm + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX10-NEXT: early-clobber %8:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec [[COPY]], [[COPY1]], 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY %8.sub0_sub1 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY killed %8.sub2_sub3 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY2]].sub1 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1 + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: merge_s_buffer_load_sgpr_imm + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX4_SGPR_IMM:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR_IMM [[COPY]], [[COPY1]], 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_96 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR_IMM]].sub0_sub1_sub2 + ; GFX12-NEXT: 
[[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX4_SGPR_IMM]].sub3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY2]].sub2 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY4]].sub0 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY4]].sub1 + ; GFX12-NEXT: S_ENDPGM 0 %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:sreg_32 = COPY $sgpr4 %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM %0:sgpr_128, %1:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) @@ -218,15 +448,21 @@ body: | ... --- -# CHECK-LABEL: name: no_merge_for_different_soffsets -# CHECK: S_BUFFER_LOAD_DWORD_SGPR_IMM %0, %1, 4, 0 :: (dereferenceable invariant load (s32)) -# CHECK: S_BUFFER_LOAD_DWORD_SGPR_IMM %0, %2, 8, 0 :: (dereferenceable invariant load (s32)) name: no_merge_for_different_soffsets tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5 + ; CHECK-LABEL: name: no_merge_for_different_soffsets + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[COPY]], [[COPY1]], 4, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[COPY]], [[COPY2]], 8, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:sreg_32 = COPY $sgpr4 %2:sreg_32 = COPY $sgpr5 @@ -237,15 +473,20 @@ body: | ... 
--- -# CHECK-LABEL: name: no_merge_for_non_adjacent_offsets -# CHECK: S_BUFFER_LOAD_DWORD_SGPR_IMM %0, %1, 4, 0 :: (dereferenceable invariant load (s32)) -# CHECK: S_BUFFER_LOAD_DWORD_SGPR_IMM %0, %1, 12, 0 :: (dereferenceable invariant load (s32)) name: no_merge_for_non_adjacent_offsets tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4 + ; CHECK-LABEL: name: no_merge_for_non_adjacent_offsets + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[COPY]], [[COPY1]], 4, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[COPY]], [[COPY1]], 12, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:sreg_32 = COPY $sgpr4 %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM %0:sgpr_128, %1:sreg_32, 4, 0 :: (dereferenceable invariant load (s32)) @@ -253,4 +494,420 @@ body: | S_ENDPGM 0 ... + +# The constrained multi-dword buffer load merge tests. 
+ +--- +name: merge_s_buffer_load_x1_x2ec +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + ; CHECK-LABEL: name: merge_s_buffer_load_x1_x2ec + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: early-clobber %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec [[COPY]], 4, 0 :: (dereferenceable invariant load (s64)) + ; CHECK-NEXT: S_ENDPGM 0 + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) + early-clobber %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s64)) + + S_ENDPGM 0 +... +--- + +name: merge_s_buffer_load_x2ec_x1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + ; GFX10-LABEL: name: merge_s_buffer_load_x2ec_x1 + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: early-clobber %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s64)) + ; GFX10-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY]], 8, 0 :: (dereferenceable invariant load (s32)) + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: merge_s_buffer_load_x2ec_x1 + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_BUFFER_LOAD_DWORDX3_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s96), align 8) + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY [[S_BUFFER_LOAD_DWORDX3_IMM]].sub0_sub1 + ; GFX12-NEXT: 
[[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX3_IMM]].sub2 + ; GFX12-NEXT: S_ENDPGM 0 + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + early-clobber %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s64)) + %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s32)) + + S_ENDPGM 0 +... +--- + +name: merge_s_buffer_load_x1_x3ec +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + ; CHECK-LABEL: name: merge_s_buffer_load_x1_x3ec + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: early-clobber %2:sgpr_96 = S_BUFFER_LOAD_DWORDX3_IMM_ec [[COPY]], 4, 0 :: (dereferenceable invariant load (s96), align 16) + ; CHECK-NEXT: S_ENDPGM 0 + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32)) + early-clobber %2:sgpr_96 = S_BUFFER_LOAD_DWORDX3_IMM_ec %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s96)) + + S_ENDPGM 0 +... 
+--- + +name: merge_s_buffer_load_x3ec_x1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + ; CHECK-LABEL: name: merge_s_buffer_load_x3ec_x1 + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s128)) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_96 = COPY [[S_BUFFER_LOAD_DWORDX4_IMM]].sub0_sub1_sub2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_BUFFER_LOAD_DWORDX4_IMM]].sub3 + ; CHECK-NEXT: S_ENDPGM 0 + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + early-clobber %1:sgpr_96 = S_BUFFER_LOAD_DWORDX3_IMM_ec %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s96)) + %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 12, 0 :: (dereferenceable invariant load (s32)) + + S_ENDPGM 0 +... +--- + +name: merge_s_buffer_load_x8_out_of_x2ec_reordered +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + ; GFX10-LABEL: name: merge_s_buffer_load_x8_out_of_x2ec_reordered + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: early-clobber %7:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %7.sub4_sub5_sub6_sub7 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %7.sub0_sub1_sub2_sub3 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1 + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: 
merge_s_buffer_load_x8_out_of_x2ec_reordered + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8) + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1 + ; GFX12-NEXT: S_ENDPGM 0 + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + early-clobber %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s64)) + early-clobber %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s64)) + early-clobber %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s64)) + early-clobber %4:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s64)) + + S_ENDPGM 0 +... 
--- + +name: merge_s_buffer_load_x8_out_of_x2ec_x2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + ; GFX10-LABEL: name: merge_s_buffer_load_x8_out_of_x2ec_x2 + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: early-clobber %7:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %7.sub4_sub5_sub6_sub7 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %7.sub0_sub1_sub2_sub3 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1 + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: merge_s_buffer_load_x8_out_of_x2ec_x2 + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 8) + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY [[COPY2]].sub2_sub3 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub0_sub1 + ; GFX12-NEXT: S_ENDPGM 0 + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + early-clobber %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s64)) + 
early-clobber %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s64)) + %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s64)) + %4:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s64)) + + S_ENDPGM 0 +... +--- + +name: merge_s_buffer_load_x8_out_of_x4ec +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + ; GFX10-LABEL: name: merge_s_buffer_load_x8_out_of_x4ec + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: early-clobber %3:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7 + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: merge_s_buffer_load_x8_out_of_x4ec + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX12-NEXT: S_ENDPGM 0 + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + early-clobber %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM_ec %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128)) + early-clobber %2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM_ec %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s128)) + + S_ENDPGM 0 +... 
+--- + +name: merge_s_buffer_load_x8_out_of_x4ec_x4 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + ; GFX10-LABEL: name: merge_s_buffer_load_x8_out_of_x4ec_x4 + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: early-clobber %3:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7 + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: merge_s_buffer_load_x8_out_of_x4ec_x4 + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX12-NEXT: S_ENDPGM 0 + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + early-clobber %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM_ec %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128)) + %2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s128)) + + S_ENDPGM 0 +... 
+--- + +name: merge_s_buffer_load_x8_out_of_x4_x4ec +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + ; GFX10-LABEL: name: merge_s_buffer_load_x8_out_of_x4_x4ec + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: early-clobber %3:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7 + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: merge_s_buffer_load_x8_out_of_x4_x4ec + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX12-NEXT: S_ENDPGM 0 + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128)) + early-clobber %2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM_ec %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s128)) + + S_ENDPGM 0 +... 
+--- + +name: merge_s_buffer_load_x8_mixed_including_ec_opcodes +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + ; GFX10-LABEL: name: merge_s_buffer_load_x8_mixed_including_ec_opcodes + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: early-clobber %7:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub2_sub3 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1 + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: merge_s_buffer_load_x8_mixed_including_ec_opcodes + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY killed [[COPY2]].sub2_sub3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY3]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY3]].sub1 + ; GFX12-NEXT: S_ENDPGM 0 + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + early-clobber %1:sgpr_128 = 
S_BUFFER_LOAD_DWORDX4_IMM_ec %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128)) + %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) + early-clobber %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM_ec %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s64)) + %4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 20, 0 :: (dereferenceable invariant load (s32)) + + S_ENDPGM 0 +... +--- + +name: merge_s_buffer_load_sgpr_imm_x2ec_x2ec +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4 + + ; GFX10-LABEL: name: merge_s_buffer_load_sgpr_imm_x2ec_x2ec + ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX10-NEXT: early-clobber %4:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec [[COPY]], [[COPY1]], 0, 0 :: (dereferenceable invariant load (s128), align 8) + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY %4.sub0_sub1 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed %4.sub2_sub3 + ; GFX10-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: merge_s_buffer_load_sgpr_imm_x2ec_x2ec + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[S_BUFFER_LOAD_DWORDX4_SGPR_IMM:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR_IMM [[COPY]], [[COPY1]], 0, 0 :: (dereferenceable invariant load (s128), align 8) + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR_IMM]].sub0_sub1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[S_BUFFER_LOAD_DWORDX4_SGPR_IMM]].sub2_sub3 + ; GFX12-NEXT: S_ENDPGM 0 + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sreg_32 = COPY $sgpr4 + early-clobber %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec %0:sgpr_128, 
%1:sreg_32, 0, 0 :: (dereferenceable invariant load (s64)) + early-clobber %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec %0:sgpr_128, %1:sreg_32, 8, 0 :: (dereferenceable invariant load (s64)) + + S_ENDPGM 0 +... + +# No constrained opcode required when the MEM operand has met the required alignment. + +--- + +name: merge_s_buffer_load_x2_x2_no_constrained_opc_needed +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + ; CHECK-LABEL: name: merge_s_buffer_load_x2_x2_no_constrained_opc_needed + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s128)) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY [[S_BUFFER_LOAD_DWORDX4_IMM]].sub0_sub1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY killed [[S_BUFFER_LOAD_DWORDX4_IMM]].sub2_sub3 + ; CHECK-NEXT: S_ENDPGM 0 + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s64), align 16) + %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s64)) + + S_ENDPGM 0 +... 
+--- + +name: merge_s_buffer_load_x4_x4_no_constrained_opc_needed +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + ; CHECK-LABEL: name: merge_s_buffer_load_x4_x4_no_constrained_opc_needed + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s256)) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY [[S_BUFFER_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY killed [[S_BUFFER_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; CHECK-NEXT: S_ENDPGM 0 + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128), align 32) + %2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s128)) + + S_ENDPGM 0 +... 
+--- + +name: merge_s_buffer_load_sgpr_imm_x2ec_x2ec_no_constrained_opc_needed +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4 + + ; CHECK-LABEL: name: merge_s_buffer_load_sgpr_imm_x2ec_x2ec_no_constrained_opc_needed + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORDX4_SGPR_IMM:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR_IMM [[COPY]], [[COPY1]], 0, 0 :: (dereferenceable invariant load (s128)) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[S_BUFFER_LOAD_DWORDX4_SGPR_IMM]].sub0_sub1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[S_BUFFER_LOAD_DWORDX4_SGPR_IMM]].sub2_sub3 + ; CHECK-NEXT: S_ENDPGM 0 + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sreg_32 = COPY $sgpr4 + %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_SGPR_IMM %0:sgpr_128, %1:sreg_32, 0, 0 :: (dereferenceable invariant load (s64), align 16) + %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_SGPR_IMM %0:sgpr_128, %1:sreg_32, 8, 0 :: (dereferenceable invariant load (s64)) + + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index e86ee1adef3d..3a6b0485d241 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -34,9 +34,8 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call ; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) -; ATTRIBUTOR_GCN-NEXT: [[FPTR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[FPTR]] to ptr -; ATTRIBUTOR_GCN-NEXT: store ptr @indirect, ptr [[FPTR_CAST]], align 8 -; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load ptr, ptr [[FPTR_CAST]], align 8 +; ATTRIBUTOR_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8 +; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8 ; ATTRIBUTOR_GCN-NEXT: call void [[FP]]() ; ATTRIBUTOR_GCN-NEXT: ret void ; @@ -75,12 +74,16 @@ define amdgpu_kernel void @test_simple_indirect_call() { ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } ;. 
- -!llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdhsa_code_object_version", i32 500} +; AKF_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} +;. +; ATTRIBUTOR_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} +;. diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll index aa92c7ab4fb9..e74e4f2ad389 100644 --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -38,7 +38,6 @@ ; CHECK-NEXT: Constant Hoisting ; CHECK-NEXT: Replace intrinsics with calls to vector library ; CHECK-NEXT: Partially inline calls to library functions -; CHECK-NEXT: Expand vector predication intrinsics ; CHECK-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining) ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics diff --git a/llvm/test/CodeGen/ARM/setjmp-bti-basic.ll b/llvm/test/CodeGen/ARM/setjmp-bti-basic.ll index 3b01e3e9327e..7fe7015a482a 100644 --- a/llvm/test/CodeGen/ARM/setjmp-bti-basic.ll +++ b/llvm/test/CodeGen/ARM/setjmp-bti-basic.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi < %s | FileCheck %s --check-prefix=BTI ; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+no-bti-at-return-twice < %s | \ ; RUN: FileCheck %s --check-prefix=NOBTI @@ -20,11 +21,43 @@ define i32 @foo(i32 %x) "branch-target-enforcement" { ; BTI-LABEL: foo: -; BTI: bl setjmp -; BTI-NEXT: bti +; BTI: @ %bb.0: @ %entry +; BTI-NEXT: bti +; BTI-NEXT: .save {r4, lr} +; BTI-NEXT: push {r4, lr} +; BTI-NEXT: mov r4, r0 +; BTI-NEXT: movw r0, :lower16:buf +; BTI-NEXT: movt r0, :upper16:buf +; BTI-NEXT: bl setjmp +; BTI-NEXT: bti +; BTI-NEXT: cmp r0, #0 +; BTI-NEXT: itt ne +; BTI-NEXT: movne r0, #0 +; BTI-NEXT: popne {r4, pc} +; BTI-NEXT: .LBB0_1: @ %if.else +; BTI-NEXT: mov r0, r4 +; BTI-NEXT: bl bar +; BTI-NEXT: 
mov r0, r4 +; BTI-NEXT: pop {r4, pc} +; ; NOBTI-LABEL: foo: -; NOBTI: bl setjmp -; NOBTI-NOT: bti +; NOBTI: @ %bb.0: @ %entry +; NOBTI-NEXT: bti +; NOBTI-NEXT: .save {r4, lr} +; NOBTI-NEXT: push {r4, lr} +; NOBTI-NEXT: mov r4, r0 +; NOBTI-NEXT: movw r0, :lower16:buf +; NOBTI-NEXT: movt r0, :upper16:buf +; NOBTI-NEXT: bl setjmp +; NOBTI-NEXT: cmp r0, #0 +; NOBTI-NEXT: itt ne +; NOBTI-NEXT: movne r0, #0 +; NOBTI-NEXT: popne {r4, pc} +; NOBTI-NEXT: .LBB0_1: @ %if.else +; NOBTI-NEXT: mov r0, r4 +; NOBTI-NEXT: bl bar +; NOBTI-NEXT: mov r0, r4 +; NOBTI-NEXT: pop {r4, pc} entry: %call = call i32 @setjmp(ptr @buf) #0 @@ -40,6 +73,41 @@ if.end: ; preds = %entry, %if.else ret i32 %x.addr.0 } +;; Check that the BL to setjmp correctly clobbers LR + +define i32 @baz() "branch-target-enforcement" { +; BTI-LABEL: baz: +; BTI: @ %bb.0: @ %entry +; BTI-NEXT: bti +; BTI-NEXT: .save {r7, lr} +; BTI-NEXT: push {r7, lr} +; BTI-NEXT: .pad #160 +; BTI-NEXT: sub sp, #160 +; BTI-NEXT: mov r0, sp +; BTI-NEXT: bl setjmp +; BTI-NEXT: bti +; BTI-NEXT: movs r0, #0 +; BTI-NEXT: add sp, #160 +; BTI-NEXT: pop {r7, pc} +; +; NOBTI-LABEL: baz: +; NOBTI: @ %bb.0: @ %entry +; NOBTI-NEXT: bti +; NOBTI-NEXT: .save {r7, lr} +; NOBTI-NEXT: push {r7, lr} +; NOBTI-NEXT: .pad #160 +; NOBTI-NEXT: sub sp, #160 +; NOBTI-NEXT: mov r0, sp +; NOBTI-NEXT: bl setjmp +; NOBTI-NEXT: movs r0, #0 +; NOBTI-NEXT: add sp, #160 +; NOBTI-NEXT: pop {r7, pc} +entry: + %outgoing_jb = alloca [20 x i64], align 8 + %call = call i32 @setjmp(ptr %outgoing_jb) returns_twice + ret i32 0 +} + declare void @bar(i32) declare i32 @setjmp(ptr) #0 diff --git a/llvm/test/CodeGen/BPF/objdump_atomics.ll b/llvm/test/CodeGen/BPF/objdump_atomics.ll index 3ec364f7368b..c4cb16b2c364 100644 --- a/llvm/test/CodeGen/BPF/objdump_atomics.ll +++ b/llvm/test/CodeGen/BPF/objdump_atomics.ll @@ -2,7 +2,7 @@ ; CHECK-LABEL: test_load_add_32 ; CHECK: c3 21 -; CHECK: r2 = atomic_fetch_add((u32 *)(r1 + 0), r2) +; CHECK: w2 = atomic_fetch_add((u32 *)(r1 + 0), w2) 
define void @test_load_add_32(ptr %p, i32 zeroext %v) { entry: atomicrmw add ptr %p, i32 %v seq_cst diff --git a/llvm/test/CodeGen/BPF/objdump_cond_op.ll b/llvm/test/CodeGen/BPF/objdump_cond_op.ll index 3b2e6c1922fc..c64a0f2f2938 100644 --- a/llvm/test/CodeGen/BPF/objdump_cond_op.ll +++ b/llvm/test/CodeGen/BPF/objdump_cond_op.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=bpfel -filetype=obj -o - %s | llvm-objdump --no-print-imm-hex -d - | FileCheck %s +; RUN: llc -mtriple=bpfel -filetype=obj -o - %s | llvm-objdump --no-print-imm-hex --mcpu=v1 -d - | FileCheck %s ; Source Code: ; int gbl; diff --git a/llvm/test/CodeGen/BPF/objdump_imm_hex.ll b/llvm/test/CodeGen/BPF/objdump_imm_hex.ll index 1760bb6b6c52..38b93e8a39b5 100644 --- a/llvm/test/CodeGen/BPF/objdump_imm_hex.ll +++ b/llvm/test/CodeGen/BPF/objdump_imm_hex.ll @@ -53,8 +53,8 @@ define i32 @test(i64, i64) local_unnamed_addr #0 { %14 = phi i32 [ %12, %10 ], [ %7, %4 ] %15 = phi i32 [ 2, %10 ], [ 1, %4 ] store i32 %14, ptr @gbl, align 4 -; CHECK-DEC: 63 12 00 00 00 00 00 00 *(u32 *)(r2 + 0) = r1 -; CHECK-HEX: 63 12 00 00 00 00 00 00 *(u32 *)(r2 + 0x0) = r1 +; CHECK-DEC: 63 12 00 00 00 00 00 00 *(u32 *)(r2 + 0) = w1 +; CHECK-HEX: 63 12 00 00 00 00 00 00 *(u32 *)(r2 + 0x0) = w1 br label %16 ; <label>:16: ; preds = %13, %8 diff --git a/llvm/test/CodeGen/BPF/objdump_static_var.ll b/llvm/test/CodeGen/BPF/objdump_static_var.ll index a91074ebddd4..b743d82fe5e3 100644 --- a/llvm/test/CodeGen/BPF/objdump_static_var.ll +++ b/llvm/test/CodeGen/BPF/objdump_static_var.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=bpfel -filetype=obj -o - %s | llvm-objdump --no-print-imm-hex -d - | FileCheck --check-prefix=CHECK %s -; RUN: llc -mtriple=bpfeb -filetype=obj -o - %s | llvm-objdump --no-print-imm-hex -d - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=bpfel -filetype=obj -o - %s | llvm-objdump --no-print-imm-hex --mcpu=v1 -d - | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=bpfeb -filetype=obj -o - %s | llvm-objdump 
--no-print-imm-hex --mcpu=v1 -d - | FileCheck --check-prefix=CHECK %s ; src: ; static volatile long a = 2; diff --git a/llvm/test/CodeGen/Generic/expand-vp-fp-intrinsics.ll b/llvm/test/CodeGen/Generic/expand-vp-fp-intrinsics.ll deleted file mode 100644 index bc89ddea6b85..000000000000 --- a/llvm/test/CodeGen/Generic/expand-vp-fp-intrinsics.ll +++ /dev/null @@ -1,176 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -expandvp -S < %s | FileCheck %s - -define void @vp_fadd_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { -; CHECK-LABEL: define void @vp_fadd_v4f32( -; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[RES1:%.*]] = fadd <4 x float> [[A0]], [[A1]] -; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 -; CHECK-NEXT: ret void -; - %res = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp) - store <4 x float> %res, ptr %out - ret void -} -declare <4 x float> @llvm.vp.fadd.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) - -define void @vp_fsub_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { -; CHECK-LABEL: define void @vp_fsub_v4f32( -; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES1:%.*]] = fsub <4 x float> [[A0]], [[A1]] -; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 -; CHECK-NEXT: ret void -; - %res = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp) - store <4 x float> %res, ptr %out - ret void -} -declare <4 x float> @llvm.vp.fsub.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) - -define void @vp_fmul_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { -; CHECK-LABEL: define void @vp_fmul_v4f32( -; CHECK-SAME: 
<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES1:%.*]] = fmul <4 x float> [[A0]], [[A1]] -; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 -; CHECK-NEXT: ret void -; - %res = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp) - store <4 x float> %res, ptr %out - ret void -} -declare <4 x float> @llvm.vp.fmul.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) - -define void @vp_fdiv_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { -; CHECK-LABEL: define void @vp_fdiv_v4f32( -; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES1:%.*]] = fdiv <4 x float> [[A0]], [[A1]] -; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 -; CHECK-NEXT: ret void -; - %res = call <4 x float> @llvm.vp.fdiv.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp) - store <4 x float> %res, ptr %out - ret void -} -declare <4 x float> @llvm.vp.fdiv.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) - -define void @vp_frem_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { -; CHECK-LABEL: define void @vp_frem_v4f32( -; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES1:%.*]] = frem <4 x float> [[A0]], [[A1]] -; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 -; CHECK-NEXT: ret void -; - %res = call <4 x float> @llvm.vp.frem.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp) - store <4 x float> %res, ptr %out - ret void -} -declare <4 x float> @llvm.vp.frem.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) - -define void @vp_fabs_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { -; CHECK-LABEL: define void @vp_fabs_v4f32( -; CHECK-SAME: <4 x float> 
[[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[A0]]) -; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 -; CHECK-NEXT: ret void -; - %res = call <4 x float> @llvm.vp.fabs.v4f32(<4 x float> %a0, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp) - store <4 x float> %res, ptr %out - ret void -} -declare <4 x float> @llvm.vp.fabs.v4f32(<4 x float>, <4 x i1>, i32) - -define void @vp_sqrt_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { -; CHECK-LABEL: define void @vp_sqrt_v4f32( -; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[A0]]) -; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 -; CHECK-NEXT: ret void -; - %res = call <4 x float> @llvm.vp.sqrt.v4f32(<4 x float> %a0, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp) - store <4 x float> %res, ptr %out - ret void -} -declare <4 x float> @llvm.vp.sqrt.v4f32(<4 x float>, <4 x i1>, i32) - -define void @vp_fneg_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { -; CHECK-LABEL: define void @vp_fneg_v4f32( -; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES1:%.*]] = fneg <4 x float> [[A0]] -; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 -; CHECK-NEXT: ret void -; - %res = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> %a0, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp) - store <4 x float> %res, ptr %out - ret void -} -declare <4 x float> @llvm.vp.fneg.v4f32(<4 x float>, <4 x i1>, i32) - -define void @vp_fma_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) nounwind { -; CHECK-LABEL: define void @vp_fma_v4f32( -; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i4 [[A5:%.*]]) 
#[[ATTR0]] { -; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A1]]) -; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 -; CHECK-NEXT: ret void -; - %res = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 4) - store <4 x float> %res, ptr %out - ret void -} -declare <4 x float> @llvm.vp.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32) - -define void @vp_fmuladd_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) nounwind { -; CHECK-LABEL: define void @vp_fmuladd_v4f32( -; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i4 [[A5:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A1]]) -; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 -; CHECK-NEXT: ret void -; - %res = call <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 4) - store <4 x float> %res, ptr %out - ret void -} -declare <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32) - -declare <4 x float> @llvm.vp.maxnum.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) -define <4 x float> @vfmax_vv_v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: define <4 x float> @vfmax_vv_v4f32( -; CHECK-SAME: <4 x float> [[VA:%.*]], <4 x float> [[VB:%.*]], <4 x i1> [[M:%.*]], i32 zeroext [[EVL:%.*]]) { -; CHECK-NEXT: [[V1:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VA]], <4 x float> [[VB]]) -; CHECK-NEXT: ret <4 x float> [[V1]] -; - %v = call <4 x float> @llvm.vp.maxnum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 %evl) - ret <4 x float> %v -} - -declare <8 x float> @llvm.vp.maxnum.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32) -define <8 x 
float> @vfmax_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: define <8 x float> @vfmax_vv_v8f32( -; CHECK-SAME: <8 x float> [[VA:%.*]], <8 x float> [[VB:%.*]], <8 x i1> [[M:%.*]], i32 zeroext [[EVL:%.*]]) { -; CHECK-NEXT: [[V1:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[VA]], <8 x float> [[VB]]) -; CHECK-NEXT: ret <8 x float> [[V1]] -; - %v = call <8 x float> @llvm.vp.maxnum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 %evl) - ret <8 x float> %v -} - -declare <4 x float> @llvm.vp.minnum.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) -define <4 x float> @vfmin_vv_v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: define <4 x float> @vfmin_vv_v4f32( -; CHECK-SAME: <4 x float> [[VA:%.*]], <4 x float> [[VB:%.*]], <4 x i1> [[M:%.*]], i32 zeroext [[EVL:%.*]]) { -; CHECK-NEXT: [[V1:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VA]], <4 x float> [[VB]]) -; CHECK-NEXT: ret <4 x float> [[V1]] -; - %v = call <4 x float> @llvm.vp.minnum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 %evl) - ret <4 x float> %v -} - -declare <8 x float> @llvm.vp.minnum.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32) -define <8 x float> @vfmin_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: define <8 x float> @vfmin_vv_v8f32( -; CHECK-SAME: <8 x float> [[VA:%.*]], <8 x float> [[VB:%.*]], <8 x i1> [[M:%.*]], i32 zeroext [[EVL:%.*]]) { -; CHECK-NEXT: [[V1:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[VA]], <8 x float> [[VB]]) -; CHECK-NEXT: ret <8 x float> [[V1]] -; - %v = call <8 x float> @llvm.vp.minnum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 %evl) - ret <8 x float> %v -} diff --git a/llvm/test/CodeGen/Generic/expand-vp-gather-scatter.ll b/llvm/test/CodeGen/Generic/expand-vp-gather-scatter.ll deleted file mode 100644 index 2e2dba50c845..000000000000 --- 
a/llvm/test/CodeGen/Generic/expand-vp-gather-scatter.ll +++ /dev/null @@ -1,118 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt --expandvp -S < %s | FileCheck %s - -; Fixed vectors -define <4 x i32> @vpgather_v4i32(<4 x ptr> %ptrs, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: @vpgather_v4i32( -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EVL:%.*]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i1> [[TMP1]], [[M:%.*]] -; CHECK-NEXT: [[V1:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[PTRS:%.*]], i32 4, <4 x i1> [[TMP2]], <4 x i32> poison) -; CHECK-NEXT: ret <4 x i32> [[V1]] -; - %v = call <4 x i32> @llvm.vp.gather.v4i32.v4p0(<4 x ptr> %ptrs, <4 x i1> %m, i32 %evl) - ret <4 x i32> %v -} - -define <2 x i64> @vpgather_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: @vpgather_v2i64( -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]] -; CHECK-NEXT: [[V1:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[PTRS:%.*]], i32 8, <2 x i1> [[TMP2]], <2 x i64> poison) -; CHECK-NEXT: ret <2 x i64> [[V1]] -; - %v = call <2 x i64> @llvm.vp.gather.v2i64.v2p0(<2 x ptr> %ptrs, <2 x i1> %m, i32 %evl) - ret <2 x i64> %v -} - -define void @vpscatter_v4i32(<4 x i32> %val, <4 x ptr> %ptrs, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: @vpscatter_v4i32( -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EVL:%.*]], 
i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i1> [[TMP1]], [[M:%.*]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL:%.*]], <4 x ptr> [[PTRS:%.*]], i32 4, <4 x i1> [[TMP2]]) -; CHECK-NEXT: ret void -; - call void @llvm.vp.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, <4 x i1> %m, i32 %evl) - ret void -} - -define void @vpscatter_v2i64(<2 x i64> %val, <2 x ptr> %ptrs, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: @vpscatter_v2i64( -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]] -; CHECK-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[VAL:%.*]], <2 x ptr> [[PTRS:%.*]], i32 8, <2 x i1> [[TMP2]]) -; CHECK-NEXT: ret void -; - call void @llvm.vp.scatter.v2i64.v2p0(<2 x i64> %val, <2 x ptr> %ptrs, <2 x i1> %m, i32 %evl) - ret void -} - -; Scalable vectors -define <vscale x 2 x i32> @vpgather_nxv2i32(<vscale x 2 x ptr> %ptrs, <vscale x 2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: @vpgather_nxv2i32( -; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 0, i32 [[EVL:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 2 x i1> [[TMP1]], [[M:%.*]] -; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 2 -; CHECK-NEXT: [[V1:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[PTRS:%.*]], i32 4, <vscale x 2 x i1> [[TMP2]], <vscale x 2 x i32> poison) -; CHECK-NEXT: ret <vscale x 
2 x i32> [[V1]] -; - %v = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> %ptrs, <vscale x 2 x i1> %m, i32 %evl) - ret <vscale x 2 x i32> %v -} - -define <vscale x 1 x i64> @vpgather_nxv1i64(<vscale x 1 x ptr> %ptrs, <vscale x 1 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: @vpgather_nxv1i64( -; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]] -; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1 -; CHECK-NEXT: [[V1:%.*]] = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> [[PTRS:%.*]], i32 8, <vscale x 1 x i1> [[TMP2]], <vscale x 1 x i64> poison) -; CHECK-NEXT: ret <vscale x 1 x i64> [[V1]] -; - %v = call <vscale x 1 x i64> @llvm.vp.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> %ptrs, <vscale x 1 x i1> %m, i32 %evl) - ret <vscale x 1 x i64> %v -} - -define void @vpscatter_nxv2i32(<vscale x 2 x i32> %val, <vscale x 2 x ptr> %ptrs, <vscale x 2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: @vpscatter_nxv2i32( -; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 0, i32 [[EVL:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 2 x i1> [[TMP1]], [[M:%.*]] -; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 2 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> [[VAL:%.*]], <vscale x 2 x ptr> [[PTRS:%.*]], i32 4, <vscale x 2 x i1> [[TMP2]]) -; CHECK-NEXT: ret void -; - call void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> %val, <vscale x 2 x ptr> %ptrs, <vscale x 2 x i1> %m, i32 %evl) - ret void -} - -define void @vpscatter_nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x ptr> %ptrs, <vscale x 1 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: @vpscatter_nxv1i64( -; CHECK-NEXT: 
[[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]] -; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x ptr> [[PTRS:%.*]], i32 8, <vscale x 1 x i1> [[TMP2]]) -; CHECK-NEXT: ret void -; - call void @llvm.vp.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> %val, <vscale x 1 x ptr> %ptrs, <vscale x 1 x i1> %m, i32 %evl) - ret void -} - -declare <4 x i32> @llvm.vp.gather.v4i32.v4p0(<4 x ptr>, <4 x i1>, i32) -declare <2 x i64> @llvm.vp.gather.v2i64.v2p0(<2 x ptr>, <2 x i1>, i32) -declare void @llvm.vp.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, <4 x i1>, i32) -declare void @llvm.vp.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, <2 x i1>, i32) - -declare <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr>, <vscale x 2 x i1>, i32) -declare <vscale x 1 x i64> @llvm.vp.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr>, <vscale x 1 x i1>, i32) -declare void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32>, <vscale x 2 x ptr>, <vscale x 2 x i1>, i32) -declare void @llvm.vp.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64>, <vscale x 1 x ptr>, <vscale x 1 x i1>, i32) diff --git a/llvm/test/CodeGen/Generic/expand-vp-load-store.ll b/llvm/test/CodeGen/Generic/expand-vp-load-store.ll deleted file mode 100644 index 5c6f1e858ce7..000000000000 --- a/llvm/test/CodeGen/Generic/expand-vp-load-store.ll +++ /dev/null @@ -1,205 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt --expandvp -S < %s | FileCheck %s -; RUN: opt --expandvp --expandvp-override-evl-transform=Legal --expandvp-override-mask-transform=Convert -S < %s | FileCheck %s - -; Fixed vectors -define <2 x i64> @vpload_v2i64(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: @vpload_v2i64( -; 
CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]], <2 x i64> poison) -; CHECK-NEXT: ret <2 x i64> [[TMP3]] -; - %load = call <2 x i64> @llvm.vp.load.v2i64.p0(ptr %ptr, <2 x i1> %m, i32 %evl) - ret <2 x i64> %load -} - -define <2 x i64> @vpload_v2i64_vlmax(ptr %ptr, <2 x i1> %m) { -; CHECK-LABEL: @vpload_v2i64_vlmax( -; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[PTR:%.*]], i32 1, <2 x i1> [[M:%.*]], <2 x i64> poison) -; CHECK-NEXT: ret <2 x i64> [[TMP1]] -; - %load = call <2 x i64> @llvm.vp.load.v2i64.p0(ptr %ptr, <2 x i1> %m, i32 2) - ret <2 x i64> %load -} - -define <2 x i64> @vpload_v2i64_allones_mask(ptr %ptr, i32 zeroext %evl) { -; CHECK-LABEL: @vpload_v2i64_allones_mask( -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], <i1 true, i1 true> -; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]], <2 x i64> poison) -; CHECK-NEXT: ret <2 x i64> [[TMP3]] -; - %load = call <2 x i64> @llvm.vp.load.v2i64.p0(ptr %ptr, <2 x i1> <i1 1, i1 1>, i32 %evl) - ret <2 x i64> %load -} - -define <2 x i64> @vpload_v2i64_allones_mask_vlmax(ptr %ptr) { -; CHECK-LABEL: @vpload_v2i64_allones_mask_vlmax( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[PTR:%.*]], align 16 -; CHECK-NEXT: ret 
<2 x i64> [[TMP1]] -; - %load = call <2 x i64> @llvm.vp.load.v2i64.p0(ptr %ptr, <2 x i1> <i1 1, i1 1>, i32 2) - ret <2 x i64> %load -} - -define void @vpstore_v2i64(<2 x i64> %val, ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: @vpstore_v2i64( -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]] -; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[VAL:%.*]], ptr [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]]) -; CHECK-NEXT: ret void -; - call void @llvm.vp.store.v2i64.p0(<2 x i64> %val, ptr %ptr, <2 x i1> %m, i32 %evl) - ret void -} - -define void @vpstore_v2i64_vlmax(<2 x i64> %val, ptr %ptr, <2 x i1> %m) { -; CHECK-LABEL: @vpstore_v2i64_vlmax( -; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[VAL:%.*]], ptr [[PTR:%.*]], i32 1, <2 x i1> [[M:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.vp.store.v2i64.p0(<2 x i64> %val, ptr %ptr, <2 x i1> %m, i32 2) - ret void -} - -define void @vpstore_v2i64_allones_mask(<2 x i64> %val, ptr %ptr, i32 zeroext %evl) { -; CHECK-LABEL: @vpstore_v2i64_allones_mask( -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], <i1 true, i1 true> -; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[VAL:%.*]], ptr [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]]) -; CHECK-NEXT: ret void -; - call void @llvm.vp.store.v2i64.p0(<2 x i64> %val, ptr %ptr, <2 x i1> <i1 1, i1 1>, i32 %evl) - ret void -} - -define void 
@vpstore_v2i64_allones_mask_vlmax(<2 x i64> %val, ptr %ptr) { -; CHECK-LABEL: @vpstore_v2i64_allones_mask_vlmax( -; CHECK-NEXT: store <2 x i64> [[VAL:%.*]], ptr [[PTR:%.*]], align 16 -; CHECK-NEXT: ret void -; - call void @llvm.vp.store.v2i64.p0(<2 x i64> %val, ptr %ptr, <2 x i1> <i1 1, i1 1>, i32 2) - ret void -} - -; Scalable vectors -define <vscale x 1 x i64> @vpload_nxv1i64(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: @vpload_nxv1i64( -; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]] -; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]], <vscale x 1 x i64> poison) -; CHECK-NEXT: ret <vscale x 1 x i64> [[TMP3]] -; - %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl) - ret <vscale x 1 x i64> %load -} - -define <vscale x 1 x i64> @vpload_nxv1i64_vscale(ptr %ptr, <vscale x 1 x i1> %m) { -; CHECK-LABEL: @vpload_nxv1i64_vscale( -; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[M:%.*]], <vscale x 1 x i64> poison) -; CHECK-NEXT: ret <vscale x 1 x i64> [[TMP1]] -; - %vscale = call i32 @llvm.vscale.i32() - %vlmax = mul nuw i32 %vscale, 1 - %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %vlmax) - ret <vscale x 1 x i64> %load -} - -define <vscale x 1 x i64> @vpload_nxv1i64_allones_mask(ptr %ptr, i32 zeroext %evl) { -; CHECK-LABEL: @vpload_nxv1i64_allones_mask( -; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 1 x i1> 
@llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer) -; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]], <vscale x 1 x i64> poison) -; CHECK-NEXT: ret <vscale x 1 x i64> [[TMP3]] -; - %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl) - ret <vscale x 1 x i64> %load -} - -define <vscale x 1 x i64> @vpload_nxv1i64_allones_mask_vscale(ptr %ptr) { -; CHECK-LABEL: @vpload_nxv1i64_allones_mask_vscale( -; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 1 x i64>, ptr [[PTR:%.*]], align 8 -; CHECK-NEXT: ret <vscale x 1 x i64> [[TMP1]] -; - %vscale = call i32 @llvm.vscale.i32() - %vlmax = mul nuw i32 %vscale, 1 - %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %vlmax) - ret <vscale x 1 x i64> %load -} - -define void @vpstore_nxv1i64(<vscale x 1 x i64> %val, ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: @vpstore_nxv1i64( -; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]] -; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1 -; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[VAL:%.*]], ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]]) -; CHECK-NEXT: ret void -; - call void 
@llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64> %val, ptr %ptr, <vscale x 1 x i1> %m, i32 %evl) - ret void -} - -define void @vpstore_nxv1i64_vscale(<vscale x 1 x i64> %val, ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: @vpstore_nxv1i64_vscale( -; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1 -; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[VAL:%.*]], ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[M:%.*]]) -; CHECK-NEXT: ret void -; - %vscale = call i32 @llvm.vscale.i32() - %vlmax = mul nuw i32 %vscale, 1 - call void @llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64> %val, ptr %ptr, <vscale x 1 x i1> %m, i32 %vlmax) - ret void -} - -define void @vpstore_nxv1i64_allones_mask(<vscale x 1 x i64> %val, ptr %ptr, i32 zeroext %evl) { -; CHECK-LABEL: @vpstore_nxv1i64_allones_mask( -; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer) -; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1 -; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[VAL:%.*]], ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]]) -; CHECK-NEXT: ret void -; - call void @llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64> %val, ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl) - ret void -} - -define void @vpstore_nxv1i64_allones_mask_vscale(<vscale x 1 x i64> %val, ptr %ptr) { -; CHECK-LABEL: @vpstore_nxv1i64_allones_mask_vscale( -; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1 -; CHECK-NEXT: store <vscale x 1 x i64> [[VAL:%.*]], ptr [[PTR:%.*]], align 8 -; CHECK-NEXT: ret 
void -; - %vscale = call i32 @llvm.vscale.i32() - %vlmax = mul nuw i32 %vscale, 1 - call void @llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64> %val, ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %vlmax) - ret void -} - -declare i32 @llvm.vscale.i32() - -declare <2 x i64> @llvm.vp.load.v2i64.p0(ptr, <2 x i1>, i32) -declare void @llvm.vp.store.v2i64.p0(<2 x i64>, ptr, <2 x i1>, i32) - -declare <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr, <vscale x 1 x i1>, i32) -declare void @llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64>, ptr, <vscale x 1 x i1>, i32) diff --git a/llvm/test/CodeGen/Generic/expand-vp.ll b/llvm/test/CodeGen/Generic/expand-vp.ll deleted file mode 100644 index 4fee9a533b94..000000000000 --- a/llvm/test/CodeGen/Generic/expand-vp.ll +++ /dev/null @@ -1,567 +0,0 @@ -; Partial expansion cases (still VP with parameter expansions). -; RUN: opt --expandvp --expandvp-override-evl-transform=Legal --expandvp-override-mask-transform=Legal -S < %s | FileCheck %s --check-prefix=LEGAL_LEGAL -; RUN: opt --expandvp --expandvp-override-evl-transform=Discard --expandvp-override-mask-transform=Legal -S < %s | FileCheck %s --check-prefix=DISCARD_LEGAL -; RUN: opt --expandvp --expandvp-override-evl-transform=Convert --expandvp-override-mask-transform=Legal -S < %s | FileCheck %s --check-prefix=CONVERT_LEGAL -; Full expansion cases (all expanded to non-VP). 
-; RUN: opt --expandvp --expandvp-override-evl-transform=Discard --expandvp-override-mask-transform=Convert -S < %s | FileCheck %s --check-prefix=ALL-CONVERT -; RUN: opt --expandvp -S < %s | FileCheck %s --check-prefix=ALL-CONVERT -; RUN: opt --expandvp --expandvp-override-evl-transform=Legal --expandvp-override-mask-transform=Convert -S < %s | FileCheck %s --check-prefix=ALL-CONVERT -; RUN: opt --expandvp --expandvp-override-evl-transform=Convert --expandvp-override-mask-transform=Convert -S < %s | FileCheck %s --check-prefix=ALL-CONVERT - - -; Fixed-width vectors -; Integer arith -declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) -declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) -declare <8 x i32> @llvm.vp.mul.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) -declare <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) -declare <8 x i32> @llvm.vp.srem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) -declare <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) -declare <8 x i32> @llvm.vp.urem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) -declare <8 x i32> @llvm.vp.smax.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) -declare <8 x i32> @llvm.vp.smin.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) -declare <8 x i32> @llvm.vp.umax.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) -declare <8 x i32> @llvm.vp.umin.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) -; Bit arith -declare <8 x i32> @llvm.vp.and.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) -declare <8 x i32> @llvm.vp.xor.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) -declare <8 x i32> @llvm.vp.or.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) -declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) -declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) -declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) -; Reductions -declare i32 @llvm.vp.reduce.add.v4i32(i32, <4 x i32>, <4 x i1>, i32) -declare i32 
@llvm.vp.reduce.mul.v4i32(i32, <4 x i32>, <4 x i1>, i32) -declare i32 @llvm.vp.reduce.and.v4i32(i32, <4 x i32>, <4 x i1>, i32) -declare i32 @llvm.vp.reduce.or.v4i32(i32, <4 x i32>, <4 x i1>, i32) -declare i32 @llvm.vp.reduce.xor.v4i32(i32, <4 x i32>, <4 x i1>, i32) -declare i32 @llvm.vp.reduce.smin.v4i32(i32, <4 x i32>, <4 x i1>, i32) -declare i32 @llvm.vp.reduce.smax.v4i32(i32, <4 x i32>, <4 x i1>, i32) -declare i32 @llvm.vp.reduce.umin.v4i32(i32, <4 x i32>, <4 x i1>, i32) -declare i32 @llvm.vp.reduce.umax.v4i32(i32, <4 x i32>, <4 x i1>, i32) -declare float @llvm.vp.reduce.fmin.v4f32(float, <4 x float>, <4 x i1>, i32) -declare float @llvm.vp.reduce.fmax.v4f32(float, <4 x float>, <4 x i1>, i32) -declare float @llvm.vp.reduce.fminimum.v4f32(float, <4 x float>, <4 x i1>, i32) -declare float @llvm.vp.reduce.fmaximum.v4f32(float, <4 x float>, <4 x i1>, i32) -declare float @llvm.vp.reduce.fadd.v4f32(float, <4 x float>, <4 x i1>, i32) -declare float @llvm.vp.reduce.fmul.v4f32(float, <4 x float>, <4 x i1>, i32) -; Comparisons -declare <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32>, <8 x i32>, metadata, <8 x i1>, i32) -declare <8 x i1> @llvm.vp.fcmp.v8f32(<8 x float>, <8 x float>, metadata, <8 x i1>, i32) - -; Fixed vector test function. 
-define void @test_vp_int_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x i32> %i2, <8 x i32> %f3, <8 x i1> %m, i32 %n) { - %r0 = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %r1 = call <8 x i32> @llvm.vp.sub.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %r2 = call <8 x i32> @llvm.vp.mul.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %r3 = call <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %r4 = call <8 x i32> @llvm.vp.srem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %r5 = call <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %r6 = call <8 x i32> @llvm.vp.urem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %r7 = call <8 x i32> @llvm.vp.smax.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %r8 = call <8 x i32> @llvm.vp.smin.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %r9 = call <8 x i32> @llvm.vp.umax.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %rA = call <8 x i32> @llvm.vp.umin.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %rB = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %rC = call <8 x i32> @llvm.vp.or.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %rD = call <8 x i32> @llvm.vp.xor.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %rE = call <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %rF = call <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %r10 = call <8 x i32> @llvm.vp.shl.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - ret void -} - -; Scalable-width vectors -; Integer arith -declare <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32) -declare <vscale x 4 x i32> @llvm.vp.sub.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32) -declare 
<vscale x 4 x i32> @llvm.vp.mul.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32) -declare <vscale x 4 x i32> @llvm.vp.sdiv.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32) -declare <vscale x 4 x i32> @llvm.vp.srem.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32) -declare <vscale x 4 x i32> @llvm.vp.udiv.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32) -declare <vscale x 4 x i32> @llvm.vp.urem.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32) -declare <vscale x 4 x i32> @llvm.vp.smax.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32) -declare <vscale x 4 x i32> @llvm.vp.smin.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32) -declare <vscale x 4 x i32> @llvm.vp.umax.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32) -declare <vscale x 4 x i32> @llvm.vp.umin.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32) -; Bit arith -declare <vscale x 4 x i32> @llvm.vp.and.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32) -declare <vscale x 4 x i32> @llvm.vp.xor.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32) -declare <vscale x 4 x i32> @llvm.vp.or.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32) -declare <vscale x 4 x i32> @llvm.vp.ashr.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32) -declare <vscale x 4 x i32> @llvm.vp.lshr.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32) -declare <vscale x 4 x i32> @llvm.vp.shl.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32) - -; Scalable vector test function. 
-define void @test_vp_int_vscale(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i32> %i2, <vscale x 4 x i32> %f3, <vscale x 4 x i1> %m, i32 %n) { - %r0 = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) - %r1 = call <vscale x 4 x i32> @llvm.vp.sub.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) - %r2 = call <vscale x 4 x i32> @llvm.vp.mul.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) - %r3 = call <vscale x 4 x i32> @llvm.vp.sdiv.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) - %r4 = call <vscale x 4 x i32> @llvm.vp.srem.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) - %r5 = call <vscale x 4 x i32> @llvm.vp.udiv.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) - %r6 = call <vscale x 4 x i32> @llvm.vp.urem.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) - %r7 = call <vscale x 4 x i32> @llvm.vp.smax.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) - %r8 = call <vscale x 4 x i32> @llvm.vp.smin.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) - %r9 = call <vscale x 4 x i32> @llvm.vp.umax.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) - %rA = call <vscale x 4 x i32> @llvm.vp.umin.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) - %rB = call <vscale x 4 x i32> @llvm.vp.and.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) - %rC = call <vscale x 4 x i32> @llvm.vp.or.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) - %rD = call <vscale x 4 x i32> @llvm.vp.xor.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 
%n) - %rE = call <vscale x 4 x i32> @llvm.vp.ashr.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) - %rF = call <vscale x 4 x i32> @llvm.vp.lshr.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) - %r10 = call <vscale x 4 x i32> @llvm.vp.shl.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) - ret void -} - -; Fixed vector reduce test function. -define void @test_vp_reduce_int_v4(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) { - %r0 = call i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) - %r1 = call i32 @llvm.vp.reduce.mul.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) - %r2 = call i32 @llvm.vp.reduce.and.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) - %r3 = call i32 @llvm.vp.reduce.or.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) - %r4 = call i32 @llvm.vp.reduce.xor.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) - %r5 = call i32 @llvm.vp.reduce.smin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) - %r6 = call i32 @llvm.vp.reduce.smax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) - %r7 = call i32 @llvm.vp.reduce.umin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) - %r8 = call i32 @llvm.vp.reduce.umax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) - ret void -} - -define void @test_vp_reduce_fp_v4(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) { - %r0 = call float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) - %r1 = call nnan float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) - %r2 = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) - %r3 = call float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) - %r4 = call nnan float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) - %r5 = call nnan ninf float 
@llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) - %r6 = call float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) - %r7 = call nnan float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) - %r8 = call nnan ninf float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) - %r9 = call float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) - %r10 = call nnan float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) - %r11 = call nnan ninf float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) - %r12 = call float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) - %r13 = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) - %r14 = call float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) - %r15 = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) - ret void -} - -define void @test_vp_cmp_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x float> %f0, <8 x float> %f1, <8 x i1> %m, i32 %n) { - %r0 = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> %i0, <8 x i32> %i1, metadata !"eq", <8 x i1> %m, i32 %n) - %r1 = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> %i0, <8 x i32> %i1, metadata !"slt", <8 x i1> %m, i32 %n) - %r2 = call <8 x i1> @llvm.vp.fcmp.v8f32(<8 x float> %f0, <8 x float> %f1, metadata !"oeq", <8 x i1> %m, i32 %n) - %r3 = call <8 x i1> @llvm.vp.fcmp.v8f32(<8 x float> %f0, <8 x float> %f1, metadata !"ult", <8 x i1> %m, i32 %n) - ret void -} - -; All VP intrinsics have to be lowered into non-VP ops -; Convert %evl into %mask for non-speculatable VP intrinsics and emit the -; instruction+select idiom with a non-VP SIMD instruction. 
-; -; ALL-CONVERT-NOT: {{call.* @llvm.vp.add}} -; ALL-CONVERT-NOT: {{call.* @llvm.vp.sub}} -; ALL-CONVERT-NOT: {{call.* @llvm.vp.mul}} -; ALL-CONVERT-NOT: {{call.* @llvm.vp.sdiv}} -; ALL-CONVERT-NOT: {{call.* @llvm.vp.srem}} -; ALL-CONVERT-NOT: {{call.* @llvm.vp.udiv}} -; ALL-CONVERT-NOT: {{call.* @llvm.vp.urem}} -; ALL-CONVERT-NOT: {{call.* @llvm.vp.and}} -; ALL-CONVERT-NOT: {{call.* @llvm.vp.or}} -; ALL-CONVERT-NOT: {{call.* @llvm.vp.xor}} -; ALL-CONVERT-NOT: {{call.* @llvm.vp.ashr}} -; ALL-CONVERT-NOT: {{call.* @llvm.vp.lshr}} -; ALL-CONVERT-NOT: {{call.* @llvm.vp.shl}} -; -; ALL-CONVERT: define void @test_vp_int_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x i32> %i2, <8 x i32> %f3, <8 x i1> %m, i32 %n) { -; ALL-CONVERT-NEXT: %{{.*}} = add <8 x i32> %i0, %i1 -; ALL-CONVERT-NEXT: %{{.*}} = sub <8 x i32> %i0, %i1 -; ALL-CONVERT-NEXT: %{{.*}} = mul <8 x i32> %i0, %i1 -; ALL-CONVERT-NEXT: [[NINS:%.+]] = insertelement <8 x i32> poison, i32 %n, i64 0 -; ALL-CONVERT-NEXT: [[NSPLAT:%.+]] = shufflevector <8 x i32> [[NINS]], <8 x i32> poison, <8 x i32> zeroinitializer -; ALL-CONVERT-NEXT: [[EVLM:%.+]] = icmp ult <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[NSPLAT]] -; ALL-CONVERT-NEXT: [[NEWM:%.+]] = and <8 x i1> [[EVLM]], %m -; ALL-CONVERT-NEXT: [[SELONE:%.+]] = select <8 x i1> [[NEWM]], <8 x i32> %i1, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> -; ALL-CONVERT-NEXT: %{{.+}} = sdiv <8 x i32> %i0, [[SELONE]] -; ALL-CONVERT-NOT: %{{.+}} = srem <8 x i32> %i0, %i1 -; ALL-CONVERT: %{{.+}} = srem <8 x i32> %i0, %{{.+}} -; ALL-CONVERT-NOT: %{{.+}} = udiv <8 x i32> %i0, %i1 -; ALL-CONVERT: %{{.+}} = udiv <8 x i32> %i0, %{{.+}} -; ALL-CONVERT-NOT: %{{.+}} = urem <8 x i32> %i0, %i1 -; ALL-CONVERT: %{{.+}} = urem <8 x i32> %i0, %{{.+}} -; ALL-CONVERT-NEXT: %{{.+}} = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %i0, <8 x i32> %i1) -; ALL-CONVERT-NEXT: %{{.+}} = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %i0, <8 x i32> %i1) -; ALL-CONVERT-NEXT: 
%{{.+}} = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %i0, <8 x i32> %i1) -; ALL-CONVERT-NEXT: %{{.+}} = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %i0, <8 x i32> %i1) -; ALL-CONVERT-NEXT: %{{.+}} = and <8 x i32> %i0, %i1 -; ALL-CONVERT-NEXT: %{{.+}} = or <8 x i32> %i0, %i1 -; ALL-CONVERT-NEXT: %{{.+}} = xor <8 x i32> %i0, %i1 -; ALL-CONVERT-NEXT: %{{.+}} = ashr <8 x i32> %i0, %i1 -; ALL-CONVERT-NEXT: %{{.+}} = lshr <8 x i32> %i0, %i1 -; ALL-CONVERT-NEXT: %{{.+}} = shl <8 x i32> %i0, %i1 -; ALL-CONVERT: ret void - - -; Check that reductions use the correct neutral element for masked-off elements -; ALL-CONVERT: define void @test_vp_reduce_int_v4(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) { -; ALL-CONVERT-NEXT: [[NINS:%.+]] = insertelement <4 x i32> poison, i32 %n, i64 0 -; ALL-CONVERT-NEXT: [[NSPLAT:%.+]] = shufflevector <4 x i32> [[NINS]], <4 x i32> poison, <4 x i32> zeroinitializer -; ALL-CONVERT-NEXT: [[EVLM:%.+]] = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[NSPLAT]] -; ALL-CONVERT-NEXT: [[NEWM:%.+]] = and <4 x i1> [[EVLM]], %m -; ALL-CONVERT-NEXT: [[ADD:%.+]] = select <4 x i1> [[NEWM]], <4 x i32> %vi, <4 x i32> zeroinitializer -; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[ADD]]) -; ALL-CONVERT-NEXT: %{{.+}} = add i32 [[RED]], %start -; ALL-CONVERT: [[MUL:%.+]] = select <4 x i1> %{{.+}}, <4 x i32> %vi, <4 x i32> <i32 1, i32 1, i32 1, i32 1> -; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[MUL]]) -; ALL-CONVERT-NEXT: %{{.+}} = mul i32 [[RED]], %start -; ALL-CONVERT: [[AND:%.+]] = select <4 x i1> %{{.+}}, <4 x i32> %vi, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> -; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[AND]]) -; ALL-CONVERT-NEXT: %{{.+}} = and i32 [[RED]], %start -; ALL-CONVERT: [[OR:%.+]] = select <4 x i1> %{{.+}}, <4 x i32> %vi, <4 x i32> zeroinitializer -; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x 
i32> [[OR]]) -; ALL-CONVERT-NEXT: %{{.+}} = or i32 [[RED]], %start -; ALL-CONVERT: [[XOR:%.+]] = select <4 x i1> %{{.+}}, <4 x i32> %vi, <4 x i32> zeroinitializer -; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[XOR]]) -; ALL-CONVERT-NEXT: %{{.+}} = xor i32 [[RED]], %start -; ALL-CONVERT: [[SMIN:%.+]] = select <4 x i1> %{{.+}}, <4 x i32> %vi, <4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647> -; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[SMIN]]) -; ALL-CONVERT-NEXT: %{{.+}} = call i32 @llvm.smin.i32(i32 [[RED]], i32 %start) -; ALL-CONVERT: [[SMAX:%.+]] = select <4 x i1> %{{.+}}, <4 x i32> %vi, <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648> -; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[SMAX]]) -; ALL-CONVERT-NEXT: %{{.+}} = call i32 @llvm.smax.i32(i32 [[RED]], i32 %start) -; ALL-CONVERT: [[UMIN:%.+]] = select <4 x i1> %{{.+}}, <4 x i32> %vi, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> -; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[UMIN]]) -; ALL-CONVERT-NEXT: %{{.+}} = call i32 @llvm.umin.i32(i32 [[RED]], i32 %start) -; ALL-CONVERT: [[UMAX:%.+]] = select <4 x i1> %{{.+}}, <4 x i32> %vi, <4 x i32> zeroinitializer -; ALL-CONVERT-NEXT: [[RED:%.+]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[UMAX]]) -; ALL-CONVERT-NEXT: %{{.+}} = call i32 @llvm.umax.i32(i32 [[RED]], i32 %start) -; ALL-CONVERT-NEXT: ret void - -; Check that reductions use the correct neutral element for masked-off elements -; ALL-CONVERT: define void @test_vp_reduce_fp_v4(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) { -; ALL-CONVERT-NEXT: [[NINS:%.+]] = insertelement <4 x i32> poison, i32 %n, i64 0 -; ALL-CONVERT-NEXT: [[NSPLAT:%.+]] = shufflevector <4 x i32> [[NINS]], <4 x i32> poison, <4 x i32> zeroinitializer -; ALL-CONVERT-NEXT: [[EVLM:%.+]] = icmp ult <4 x i32> <i32 0, i32 1, 
i32 2, i32 3>, [[NSPLAT]] -; ALL-CONVERT-NEXT: [[NEWM:%.+]] = and <4 x i1> [[EVLM]], %m -; ALL-CONVERT-NEXT: [[FMIN:%.+]] = select <4 x i1> [[NEWM]], <4 x float> %vf, <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x7FF8000000000000> -; ALL-CONVERT-NEXT: [[RED:%.+]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[FMIN]]) -; ALL-CONVERT-NEXT: %{{.+}} = call float @llvm.minnum.f32(float [[RED]], float %f) -; ALL-CONVERT: [[FMIN_NNAN:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000> -; ALL-CONVERT-NEXT: [[RED:%.+]] = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[FMIN_NNAN]]) -; ALL-CONVERT-NEXT: %{{.+}} = call nnan float @llvm.minnum.f32(float [[RED]], float %f) -; ALL-CONVERT: [[FMIN_NNAN_NINF:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 0x47EFFFFFE0000000, float 0x47EFFFFFE0000000, float 0x47EFFFFFE0000000, float 0x47EFFFFFE0000000> -; ALL-CONVERT-NEXT: [[RED:%.+]] = call nnan ninf float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[FMIN_NNAN_NINF]]) -; ALL-CONVERT-NEXT: %{{.+}} = call nnan ninf float @llvm.minnum.f32(float [[RED]], float %f) -; ALL-CONVERT: [[FMAX:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 0xFFF8000000000000, float 0xFFF8000000000000, float 0xFFF8000000000000, float 0xFFF8000000000000> -; ALL-CONVERT-NEXT: [[RED:%.+]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[FMAX]]) -; ALL-CONVERT-NEXT: %{{.+}} = call float @llvm.maxnum.f32(float [[RED]], float %f) -; ALL-CONVERT: [[FMAX_NNAN:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000> -; ALL-CONVERT-NEXT: [[RED:%.+]] = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[FMAX_NNAN]]) -; ALL-CONVERT-NEXT: %{{.+}} = call nnan float 
@llvm.maxnum.f32(float [[RED]], float %f) -; ALL-CONVERT: [[FMAX_NNAN_NINF:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 0xC7EFFFFFE0000000, float 0xC7EFFFFFE0000000, float 0xC7EFFFFFE0000000, float 0xC7EFFFFFE0000000> -; ALL-CONVERT-NEXT: [[RED:%.+]] = call nnan ninf float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[FMAX_NNAN_NINF]]) -; ALL-CONVERT-NEXT: %{{.+}} = call nnan ninf float @llvm.maxnum.f32(float [[RED]], float %f) - -; ALL-CONVERT: [[FMINIMUM:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000> -; ALL-CONVERT-NEXT: [[RED:%.+]] = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> [[FMINIMUM]]) -; ALL-CONVERT-NEXT: %{{.+}} = call float @llvm.minimum.f32(float [[RED]], float %f) -; ALL-CONVERT: [[FMINIMUM_NNAN:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000> -; ALL-CONVERT-NEXT: [[RED:%.+]] = call nnan float @llvm.vector.reduce.fminimum.v4f32(<4 x float> [[FMINIMUM_NNAN]]) -; ALL-CONVERT-NEXT: %{{.+}} = call nnan float @llvm.minimum.f32(float [[RED]], float %f) -; ALL-CONVERT: [[FMINIMUM_NNAN_NINF:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 0x47EFFFFFE0000000, float 0x47EFFFFFE0000000, float 0x47EFFFFFE0000000, float 0x47EFFFFFE0000000> -; ALL-CONVERT-NEXT: [[RED:%.+]] = call nnan ninf float @llvm.vector.reduce.fminimum.v4f32(<4 x float> [[FMINIMUM_NNAN_NINF]]) -; ALL-CONVERT-NEXT: %{{.+}} = call nnan ninf float @llvm.minimum.f32(float [[RED]], float %f) - -; ALL-CONVERT: [[FMAXIMUM:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000> -; ALL-CONVERT-NEXT: [[RED:%.+]] = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> [[FMAXIMUM]]) -; ALL-CONVERT-NEXT: 
%{{.+}} = call float @llvm.maximum.f32(float [[RED]], float %f) -; ALL-CONVERT: [[FMAXIMUM_NNAN:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000> -; ALL-CONVERT-NEXT: [[RED:%.+]] = call nnan float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> [[FMAXIMUM_NNAN]]) -; ALL-CONVERT-NEXT: %{{.+}} = call nnan float @llvm.maximum.f32(float [[RED]], float %f) -; ALL-CONVERT: [[FMAXIMUM_NNAN_NINF:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 0xC7EFFFFFE0000000, float 0xC7EFFFFFE0000000, float 0xC7EFFFFFE0000000, float 0xC7EFFFFFE0000000> -; ALL-CONVERT-NEXT: [[RED:%.+]] = call nnan ninf float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> [[FMAXIMUM_NNAN_NINF]]) -; ALL-CONVERT-NEXT: %{{.+}} = call nnan ninf float @llvm.maximum.f32(float [[RED]], float %f) - -; ALL-CONVERT: [[FADD:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00> -; ALL-CONVERT-NEXT: %{{.+}} = call float @llvm.vector.reduce.fadd.v4f32(float %f, <4 x float> [[FADD]]) -; ALL-CONVERT: [[FADD:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00> -; ALL-CONVERT-NEXT: %{{.+}} = call reassoc float @llvm.vector.reduce.fadd.v4f32(float %f, <4 x float> [[FADD]]) -; ALL-CONVERT: [[FMUL:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> -; ALL-CONVERT-NEXT: %{{.+}} = call float @llvm.vector.reduce.fmul.v4f32(float %f, <4 x float> [[FMUL]]) -; ALL-CONVERT: [[FMUL:%.+]] = select <4 x i1> %{{.+}}, <4 x float> %vf, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> -; ALL-CONVERT-NEXT: %{{.+}} = call reassoc float @llvm.vector.reduce.fmul.v4f32(float %f, <4 x float> 
[[FMUL]]) -; ALL-CONVERT-NEXT: ret void - -; Check that comparisons use the correct condition codes -; ALL-CONVERT: define void @test_vp_cmp_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x float> %f0, <8 x float> %f1, <8 x i1> %m, i32 %n) { -; ALL-CONVERT-NEXT: %{{.+}} = icmp eq <8 x i32> %i0, %i1 -; ALL-CONVERT-NEXT: %{{.+}} = icmp slt <8 x i32> %i0, %i1 -; ALL-CONVERT-NEXT: %{{.+}} = fcmp oeq <8 x float> %f0, %f1 -; ALL-CONVERT-NEXT: %{{.+}} = fcmp ult <8 x float> %f0, %f1 -; ALL-CONVERT-NEXT: ret void - - -; All legal - don't transform anything. - -; LEGAL_LEGAL: define void @test_vp_int_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x i32> %i2, <8 x i32> %f3, <8 x i1> %m, i32 %n) { -; LEGAL_LEGAL-NEXT: %r0 = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r1 = call <8 x i32> @llvm.vp.sub.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r2 = call <8 x i32> @llvm.vp.mul.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r3 = call <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r4 = call <8 x i32> @llvm.vp.srem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r5 = call <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r6 = call <8 x i32> @llvm.vp.urem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r7 = call <8 x i32> @llvm.vp.smax.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r8 = call <8 x i32> @llvm.vp.smin.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r9 = call <8 x i32> @llvm.vp.umax.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %rA = call <8 x i32> @llvm.vp.umin.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %rB = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> %i0, <8 x i32> 
%i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %rC = call <8 x i32> @llvm.vp.or.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %rD = call <8 x i32> @llvm.vp.xor.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %rE = call <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %rF = call <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r10 = call <8 x i32> @llvm.vp.shl.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: ret void - -; LEGAL_LEGAL:define void @test_vp_int_vscale(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i32> %i2, <vscale x 4 x i32> %f3, <vscale x 4 x i1> %m, i32 %n) { -; LEGAL_LEGAL-NEXT: %r0 = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r1 = call <vscale x 4 x i32> @llvm.vp.sub.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r2 = call <vscale x 4 x i32> @llvm.vp.mul.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r3 = call <vscale x 4 x i32> @llvm.vp.sdiv.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r4 = call <vscale x 4 x i32> @llvm.vp.srem.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r5 = call <vscale x 4 x i32> @llvm.vp.udiv.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r6 = call <vscale x 4 x i32> @llvm.vp.urem.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r7 = call <vscale x 4 x i32> @llvm.vp.smax.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) 
-; LEGAL_LEGAL-NEXT: %r8 = call <vscale x 4 x i32> @llvm.vp.smin.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r9 = call <vscale x 4 x i32> @llvm.vp.umax.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %rA = call <vscale x 4 x i32> @llvm.vp.umin.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %rB = call <vscale x 4 x i32> @llvm.vp.and.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %rC = call <vscale x 4 x i32> @llvm.vp.or.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %rD = call <vscale x 4 x i32> @llvm.vp.xor.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %rE = call <vscale x 4 x i32> @llvm.vp.ashr.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %rF = call <vscale x 4 x i32> @llvm.vp.lshr.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r10 = call <vscale x 4 x i32> @llvm.vp.shl.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: ret void - -; LEGAL_LEGAL: define void @test_vp_reduce_int_v4(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) { -; LEGAL_LEGAL-NEXT: %r0 = call i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r1 = call i32 @llvm.vp.reduce.mul.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r2 = call i32 @llvm.vp.reduce.and.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r3 = call i32 @llvm.vp.reduce.or.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r4 = call i32 @llvm.vp.reduce.xor.v4i32(i32 
%start, <4 x i32> %vi, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r5 = call i32 @llvm.vp.reduce.smin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r6 = call i32 @llvm.vp.reduce.smax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r7 = call i32 @llvm.vp.reduce.umin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r8 = call i32 @llvm.vp.reduce.umax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: ret void - -; LEGAL_LEGAL: define void @test_vp_reduce_fp_v4(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) { -; LEGAL_LEGAL-NEXT: %r0 = call float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r1 = call nnan float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r2 = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r3 = call float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r4 = call nnan float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r5 = call nnan ninf float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r6 = call float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r7 = call nnan float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r8 = call nnan ninf float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r9 = call float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r10 = call nnan float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r11 = call nnan ninf float 
@llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r12 = call float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r13 = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r14 = call float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r15 = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: ret void - -; LEGAL_LEGAL: define void @test_vp_cmp_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x float> %f0, <8 x float> %f1, <8 x i1> %m, i32 %n) { -; LEGAL_LEGAL-NEXT: %r0 = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> %i0, <8 x i32> %i1, metadata !"eq", <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r1 = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> %i0, <8 x i32> %i1, metadata !"slt", <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r2 = call <8 x i1> @llvm.vp.fcmp.v8f32(<8 x float> %f0, <8 x float> %f1, metadata !"oeq", <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r3 = call <8 x i1> @llvm.vp.fcmp.v8f32(<8 x float> %f0, <8 x float> %f1, metadata !"ult", <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: ret void - -; Drop %evl where possible else fold %evl into %mask (%evl Discard, %mask Legal) -; -; There is no caching yet in the ExpandVectorPredication pass and the %evl -; expansion code is emitted for every non-speculatable intrinsic again. Hence, -; only check that.. -; (1) The %evl folding code and %mask are correct for the first -; non-speculatable VP intrinsic. -; (2) All other non-speculatable VP intrinsics have a modified mask argument. -; (3) All speculatable VP intrinsics keep their %mask and %evl. -; (4) All VP intrinsics have an ineffective %evl parameter. 
- -; DISCARD_LEGAL: define void @test_vp_int_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x i32> %i2, <8 x i32> %f3, <8 x i1> %m, i32 %n) { -; DISCARD_LEGAL-NEXT: %r0 = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL-NEXT: %r1 = call <8 x i32> @llvm.vp.sub.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL-NEXT: %r2 = call <8 x i32> @llvm.vp.mul.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL-NEXT: [[NSPLATINS:%.+]] = insertelement <8 x i32> poison, i32 %n, i64 0 -; DISCARD_LEGAL-NEXT: [[NSPLAT:%.+]] = shufflevector <8 x i32> [[NSPLATINS]], <8 x i32> poison, <8 x i32> zeroinitializer -; DISCARD_LEGAL-NEXT: [[EVLMASK:%.+]] = icmp ult <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[NSPLAT]] -; DISCARD_LEGAL-NEXT: [[NEWMASK:%.+]] = and <8 x i1> [[EVLMASK]], %m -; DISCARD_LEGAL-NEXT: %r3 = call <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> [[NEWMASK]], i32 8) -; DISCARD_LEGAL-NOT: %r4 = call <8 x i32> @llvm.vp.srem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL-NOT: %r5 = call <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL-NOT: %r6 = call <8 x i32> @llvm.vp.urem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL: %r7 = call <8 x i32> @llvm.vp.smax.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL: %r8 = call <8 x i32> @llvm.vp.smin.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL: %r9 = call <8 x i32> @llvm.vp.umax.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL: %rA = call <8 x i32> @llvm.vp.umin.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL-NEXT: %rB = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL-NEXT: %rC = call <8 x i32> @llvm.vp.or.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, 
i32 8) -; DISCARD_LEGAL-NEXT: %rD = call <8 x i32> @llvm.vp.xor.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL-NEXT: %rE = call <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL-NEXT: %rF = call <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL-NEXT: %r10 = call <8 x i32> @llvm.vp.shl.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL-NEXT: ret void - -; TODO compute vscale only once and use caching. -; In the meantime, we only check for the correct vscale code for the first VP -; intrinsic and skip over it for all others. - -; DISCARD_LEGAL: define void @test_vp_int_vscale(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i32> %i2, <vscale x 4 x i32> %f3, <vscale x 4 x i1> %m, i32 %n) { -; DISCARD_LEGAL-NEXT: %vscale = call i32 @llvm.vscale.i32() -; DISCARD_LEGAL-NEXT: %scalable_size = mul nuw i32 %vscale, 4 -; DISCARD_LEGAL-NEXT: %r0 = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %scalable_size) -; DISCARD_LEGAL: %r1 = call <vscale x 4 x i32> @llvm.vp.sub.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %scalable_size{{.*}}) -; DISCARD_LEGAL: %r2 = call <vscale x 4 x i32> @llvm.vp.mul.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> %m, i32 %scalable_size{{.*}}) -; DISCARD_LEGAL: [[EVLM:%.+]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 %n) -; DISCARD_LEGAL: [[NEWM:%.+]] = and <vscale x 4 x i1> [[EVLM]], %m -; DISCARD_LEGAL: %r3 = call <vscale x 4 x i32> @llvm.vp.sdiv.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> [[NEWM]], i32 %scalable_size{{.*}}) -; DISCARD_LEGAL-NOT: %{{.+}} = call <vscale x 4 x i32> @llvm.vp.{{.*}}, i32 %n) -; DISCARD_LEGAL: ret void - -; DISCARD_LEGAL: define void @test_vp_reduce_int_v4(i32 %start, <4 x 
i32> %vi, <4 x i1> %m, i32 %n) { -; DISCARD_LEGAL-NEXT: [[NSPLATINS:%.+]] = insertelement <4 x i32> poison, i32 %n, i64 0 -; DISCARD_LEGAL-NEXT: [[NSPLAT:%.+]] = shufflevector <4 x i32> [[NSPLATINS]], <4 x i32> poison, <4 x i32> zeroinitializer -; DISCARD_LEGAL-NEXT: [[EVLMASK:%.+]] = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[NSPLAT]] -; DISCARD_LEGAL-NEXT: [[NEWMASK:%.+]] = and <4 x i1> [[EVLMASK]], %m -; DISCARD_LEGAL-NEXT: %r0 = call i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> [[NEWMASK]], i32 4) -; DISCARD_LEGAL-NOT: %r1 = call i32 @llvm.vp.reduce.mul.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NOT: %r2 = call i32 @llvm.vp.reduce.and.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NOT: %r3 = call i32 @llvm.vp.reduce.or.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NOT: %r4 = call i32 @llvm.vp.reduce.xor.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NOT: %r5 = call i32 @llvm.vp.reduce.smin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NOT: %r6 = call i32 @llvm.vp.reduce.smax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NOT: %r7 = call i32 @llvm.vp.reduce.umin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NOT: %r8 = call i32 @llvm.vp.reduce.umax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; DISCARD_LEGAL: ret void - -; DISCARD_LEGAL: define void @test_vp_reduce_fp_v4(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) { -; DISCARD_LEGAL-NEXT: [[NSPLATINS:%.+]] = insertelement <4 x i32> poison, i32 %n, i64 0 -; DISCARD_LEGAL-NEXT: [[NSPLAT:%.+]] = shufflevector <4 x i32> [[NSPLATINS]], <4 x i32> poison, <4 x i32> zeroinitializer -; DISCARD_LEGAL-NEXT: [[EVLMASK:%.+]] = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[NSPLAT]] -; DISCARD_LEGAL-NEXT: [[NEWMASK:%.+]] = and <4 x i1> [[EVLMASK]], %m -; DISCARD_LEGAL-NEXT: %r0 = call float 
@llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> [[NEWMASK]], i32 4) -; DISCARD_LEGAL-NOT: %r1 = call nnan float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NOT: %r2 = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NOT: %r3 = call float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NOT: %r4 = call nnan float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NOT: %r5 = call nnan ninf float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; DISCARD_LEGAL-NOT: %r6 = call float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; DISCARD_LEGAL-NOT: %r7 = call nnan float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; DISCARD_LEGAL-NOT: %r8 = call nnan ninf float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; DISCARD_LEGAL-NOT: %r9 = call float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; DISCARD_LEGAL-NOT: %r10 = call nnan float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; DISCARD_LEGAL-NOT: %r11 = call nnan ninf float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; DISCARD_LEGAL-NOT: %r12 = call float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; DISCARD_LEGAL-NOT: %r13 = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; DISCARD_LEGAL-NOT: %r14 = call float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; DISCARD_LEGAL-NOT: %r15 = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) -; DISCARD_LEGAL: ret void - -; DISCARD_LEGAL: define void @test_vp_cmp_v8(<8 x i32> %i0, <8 x i32> %i1, 
<8 x float> %f0, <8 x float> %f1, <8 x i1> %m, i32 %n) { -; DISCARD_LEGAL-NEXT: %r0 = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> %i0, <8 x i32> %i1, metadata !"eq", <8 x i1> %m, i32 8) -; DISCARD_LEGAL-NEXT: %r1 = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> %i0, <8 x i32> %i1, metadata !"slt", <8 x i1> %m, i32 8) -; DISCARD_LEGAL-NEXT: %r2 = call <8 x i1> @llvm.vp.fcmp.v8f32(<8 x float> %f0, <8 x float> %f1, metadata !"oeq", <8 x i1> %m, i32 8) -; DISCARD_LEGAL-NEXT: %r3 = call <8 x i1> @llvm.vp.fcmp.v8f32(<8 x float> %f0, <8 x float> %f1, metadata !"ult", <8 x i1> %m, i32 8) - -; Convert %evl into %mask everywhere (%evl Convert, %mask Legal) -; -; For the same reasons as in the (%evl Discard, %mask Legal) case only check that.. -; (1) The %evl folding code and %mask are correct for the first VP intrinsic. -; (2) All other VP intrinsics have a modified mask argument. -; (3) All VP intrinsics have an ineffective %evl parameter. -; -; CONVERT_LEGAL: define void @test_vp_int_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x i32> %i2, <8 x i32> %f3, <8 x i1> %m, i32 %n) { -; CONVERT_LEGAL-NEXT: [[NINS:%.+]] = insertelement <8 x i32> poison, i32 %n, i64 0 -; CONVERT_LEGAL-NEXT: [[NSPLAT:%.+]] = shufflevector <8 x i32> [[NINS]], <8 x i32> poison, <8 x i32> zeroinitializer -; CONVERT_LEGAL-NEXT: [[EVLM:%.+]] = icmp ult <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[NSPLAT]] -; CONVERT_LEGAL-NEXT: [[NEWM:%.+]] = and <8 x i1> [[EVLM]], %m -; CONVERT_LEGAL-NEXT: %{{.+}} = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> [[NEWM]], i32 8) -; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.sub.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.mul.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> 
@llvm.vp.srem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.urem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.or.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.xor.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i32> @llvm.vp.shl.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; CONVERT_LEGAL: ret void - -; Similar to %evl discard, %mask legal but make sure the first VP intrinsic has a legal expansion -; CONVERT_LEGAL: define void @test_vp_int_vscale(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i32> %i2, <vscale x 4 x i32> %f3, <vscale x 4 x i1> %m, i32 %n) { -; CONVERT_LEGAL-NEXT: [[EVLM:%.+]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 %n) -; CONVERT_LEGAL-NEXT: [[NEWM:%.+]] = and <vscale x 4 x i1> [[EVLM]], %m -; CONVERT_LEGAL-NEXT: %vscale = call i32 @llvm.vscale.i32() -; CONVERT_LEGAL-NEXT: %scalable_size = mul nuw i32 %vscale, 4 -; CONVERT_LEGAL-NEXT: %r0 = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> %i0, <vscale x 4 x i32> %i1, <vscale x 4 x i1> [[NEWM]], i32 %scalable_size) -; CONVERT_LEGAL-NOT: %{{.*}} = call <vscale x 4 x i32> @llvm.vp.{{.*}}, i32 %n) -; CONVERT_LEGAL: ret void - -; CONVERT_LEGAL: define void @test_vp_reduce_int_v4(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) 
{ -; CONVERT_LEGAL-NEXT: [[NINS:%.+]] = insertelement <4 x i32> poison, i32 %n, i64 0 -; CONVERT_LEGAL-NEXT: [[NSPLAT:%.+]] = shufflevector <4 x i32> [[NINS]], <4 x i32> poison, <4 x i32> zeroinitializer -; CONVERT_LEGAL-NEXT: [[EVLM:%.+]] = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[NSPLAT]] -; CONVERT_LEGAL-NEXT: [[NEWM:%.+]] = and <4 x i1> [[EVLM]], %m -; CONVERT_LEGAL-NEXT: %{{.+}} = call i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> [[NEWM]], i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call i32 @llvm.vp.reduce.mul.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call i32 @llvm.vp.reduce.and.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call i32 @llvm.vp.reduce.or.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call i32 @llvm.vp.reduce.xor.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call i32 @llvm.vp.reduce.smin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call i32 @llvm.vp.reduce.smax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call i32 @llvm.vp.reduce.umin.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call i32 @llvm.vp.reduce.umax.v4i32(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 4) -; CONVERT_LEGAL: ret void - -; CONVERT_LEGAL: define void @test_vp_reduce_fp_v4(float %f, <4 x float> %vf, <4 x i1> %m, i32 %n) { -; CONVERT_LEGAL-NEXT: [[NINS:%.+]] = insertelement <4 x i32> poison, i32 %n, i64 0 -; CONVERT_LEGAL-NEXT: [[NSPLAT:%.+]] = shufflevector <4 x i32> [[NINS]], <4 x i32> poison, <4 x i32> zeroinitializer -; CONVERT_LEGAL-NEXT: [[EVLM:%.+]] = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[NSPLAT]] -; CONVERT_LEGAL-NEXT: [[NEWM:%.+]] = and <4 x i1> [[EVLM]], %m -; CONVERT_LEGAL-NEXT: %{{.+}} = call float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x 
i1> [[NEWM]], i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call nnan float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call nnan ninf float @llvm.vp.reduce.fmin.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call nnan float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call nnan ninf float @llvm.vp.reduce.fmax.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> [[NEWM]], i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call nnan float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call nnan ninf float @llvm.vp.reduce.fminimum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call nnan float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call nnan ninf float @llvm.vp.reduce.fmaximum.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; CONVERT_LEGAL-NOT: %{{.+}} = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %f, <4 x float> %vf, <4 x i1> %m, i32 4) -; CONVERT_LEGAL: ret void - -; CONVERT_LEGAL: define void @test_vp_cmp_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x float> 
%f0, <8 x float> %f1, <8 x i1> %m, i32 %n) { -; CONVERT_LEGAL-NEXT: [[NINS:%.+]] = insertelement <8 x i32> poison, i32 %n, i64 0 -; CONVERT_LEGAL-NEXT: [[NSPLAT:%.+]] = shufflevector <8 x i32> [[NINS]], <8 x i32> poison, <8 x i32> zeroinitializer -; CONVERT_LEGAL-NEXT: [[EVLM:%.+]] = icmp ult <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[NSPLAT]] -; CONVERT_LEGAL-NEXT: [[NEWM:%.+]] = and <8 x i1> [[EVLM]], %m -; CONVERT_LEGAL-NEXT: %{{.+}} = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> %i0, <8 x i32> %i1, metadata !"eq", <8 x i1> [[NEWM]], i32 8) -; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> %i0, <8 x i32> %i1, metadata !"eq", <8 x i1> %m, i32 %n) -; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> %i0, <8 x i32> %i1, metadata !"slt", <8 x i1> %m, i32 %n -; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i1> @llvm.vp.fcmp.v8f32(<8 x float> %f0, <8 x float> %f1, metadata !"oeq", <8 x i1> %m, i32 %n) -; CONVERT_LEGAL-NOT: %{{.+}} = call <8 x i1> @llvm.vp.fcmp.v8f32(<8 x float> %f0, <8 x float> %f1, metadata !"ult", <8 x i1> %m, i32 %n) -; CONVERT_LEGAL: ret void diff --git a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll index 138f0c81238b..38c1dbcb1075 100644 --- a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll +++ b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll @@ -26,7 +26,6 @@ ; CHECK-NEXT: Lower Garbage Collection Instructions ; CHECK-NEXT: Shadow Stack GC Lowering ; CHECK-NEXT: Remove unreachable blocks from the CFG -; CHECK-NEXT: Expand vector predication intrinsics ; CHECK-NEXT: Instrument function entry/exit with calls to e.g. 
mcount() (post inlining) ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll index c5c5342e303c..391888a38daf 100644 --- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll +++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll @@ -61,7 +61,6 @@ ; LAXX-NEXT: Constant Hoisting ; LAXX-NEXT: Replace intrinsics with calls to vector library ; LAXX-NEXT: Partially inline calls to library functions -; LAXX-NEXT: Expand vector predication intrinsics ; LAXX-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining) ; LAXX-NEXT: Scalarize Masked Memory Intrinsics ; LAXX-NEXT: Expand reduction intrinsics diff --git a/llvm/test/CodeGen/M68k/pipeline.ll b/llvm/test/CodeGen/M68k/pipeline.ll index 0481d5c18e0c..6aa66d09aa78 100644 --- a/llvm/test/CodeGen/M68k/pipeline.ll +++ b/llvm/test/CodeGen/M68k/pipeline.ll @@ -32,7 +32,6 @@ ; CHECK-NEXT: Constant Hoisting ; CHECK-NEXT: Replace intrinsics with calls to vector library ; CHECK-NEXT: Partially inline calls to library functions -; CHECK-NEXT: Expand vector predication intrinsics ; CHECK-NEXT: Instrument function entry/exit with calls to e.g. 
mcount() (post inlining) ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics diff --git a/llvm/test/CodeGen/Mips/llvm-ir/and-srl.ll b/llvm/test/CodeGen/Mips/llvm-ir/and-srl.ll new file mode 100644 index 000000000000..988a0f5ee5ba --- /dev/null +++ b/llvm/test/CodeGen/Mips/llvm-ir/and-srl.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s -mtriple=mips64el-unknown-linux-gnu -mcpu=mips64 | FileCheck %s \ +; RUN: -check-prefix=MIPS4 +; RUN: llc < %s -mtriple=mips64el-unknown-linux-gnu -mcpu=mips64r2 | FileCheck %s \ +; RUN: -check-prefix=MIPS64R2 + +define i64 @foo(i64 noundef %a) { +; MIPS4-LABEL: foo: +; MIPS4: # %bb.0: # %entry +; MIPS4-NEXT: sll $1, $4, 0 +; MIPS4-NEXT: srl $1, $1, 2 +; MIPS4-NEXT: andi $1, $1, 7 +; MIPS4-NEXT: daddiu $2, $zero, 1 +; MIPS4-NEXT: jr $ra +; MIPS4-NEXT: dsllv $2, $2, $1 +; +; MIPS64R2-LABEL: foo: +; MIPS64R2: # %bb.0: # %entry +; MIPS64R2-NEXT: sll $1, $4, 0 +; MIPS64R2-NEXT: ext $1, $1, 2, 3 +; MIPS64R2-NEXT: daddiu $2, $zero, 1 +; MIPS64R2-NEXT: jr $ra +; MIPS64R2-NEXT: dsllv $2, $2, $1 +entry: + %div1 = lshr i64 %a, 2 + %and = and i64 %div1, 7 + %shl = shl nuw nsw i64 1, %and + ret i64 %shl +} diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap.ll new file mode 100644 index 000000000000..83a2ca4f481b --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap.ll @@ -0,0 +1,36 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-12.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | %ptxas-verify -arch=sm_90 %} + +; CHECK-LABEL: test_fence_proxy_tensormap_generic_release +define void @test_fence_proxy_tensormap_generic_release() { + ; CHECK: fence.proxy.tensormap::generic.release.cta; + call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cta(); + + ; CHECK: fence.proxy.tensormap::generic.release.cluster; + call void 
@llvm.nvvm.fence.proxy.tensormap_generic.release.cluster(); + + ; CHECK: fence.proxy.tensormap::generic.release.gpu; + call void @llvm.nvvm.fence.proxy.tensormap_generic.release.gpu(); + + ; CHECK: fence.proxy.tensormap::generic.release.sys; + call void @llvm.nvvm.fence.proxy.tensormap_generic.release.sys(); + + ret void +} + +; CHECK-LABEL: test_fence_proxy_tensormap_generic_acquire +define void @test_fence_proxy_tensormap_generic_acquire(ptr addrspace(0) %addr) { + ; CHECK: fence.proxy.tensormap::generic.acquire.cta [%rd{{[0-9]+}}], 128; + call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr addrspace(0) %addr, i32 128); + + ; CHECK: fence.proxy.tensormap::generic.acquire.cluster [%rd{{[0-9]+}}], 128; + call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cluster(ptr addrspace(0) %addr, i32 128); + + ; CHECK: fence.proxy.tensormap::generic.acquire.gpu [%rd{{[0-9]+}}], 128; + call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu(ptr addrspace(0) %addr, i32 128); + + ; CHECK: fence.proxy.tensormap::generic.acquire.sys [%rd{{[0-9]+}}], 128; + call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.sys(ptr addrspace(0) %addr, i32 128); + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll index 68915b0f2698..9cea33d12027 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll @@ -1,169 +1,7 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s ; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} -; CHECK-LABEL: generic_plain -define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { - ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load i8, ptr %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i8 %a.add, ptr %a - - ; CHECK: ld.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load 
i16, ptr %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr %b - - ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr %c - - ; CHECK: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr %d - - ; CHECK: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr %c - - ; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr %c - - ret void -} - -; CHECK-LABEL: generic_volatile -define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load volatile i8, ptr %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i8 %a.add, ptr %a - - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i16 %b.add, ptr %b - - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr %c - - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr %d - - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr %c - %e.add 
= fadd float %e.load, 1. - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr %c - - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr %c - - ret void -} - -; CHECK-LABEL: generic_unordered -define void @generic_unordered(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr %a unordered, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr %a unordered, align 1 - - ; CHECK: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr %b unordered, align 2 - - ; CHECK: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr %c unordered, align 4 - - ; CHECK: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr %d unordered, align 8 - - ; CHECK: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr %e unordered, align 4 - - ; CHECK: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr %e unordered, align 8 - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr %e unordered, align 8 - - ret void -} - -; CHECK-LABEL: generic_monotonic -define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr %a monotonic, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr %a monotonic, align 1 - - ; CHECK: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr %b monotonic, align 2 - - ; CHECK: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr %c monotonic, align 4 - - ; CHECK: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr %d monotonic, align 8 - - ; CHECK: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr %e monotonic, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr %e monotonic, align 4 - - ; CHECK: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr %e monotonic, align 8 - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr %e monotonic, align 8 - - ret void -} +;; generic statespace ; CHECK-LABEL: generic_acq_rel define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { @@ -206,335 +44,154 @@ define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnam ret void } -; CHECK-LABEL: generic_unordered_volatile -define void @generic_unordered_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr %a unordered, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr %a unordered, align 1 - - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr %b unordered, align 2 - - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr %c unordered, align 4 - - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr %d unordered, align 8 - - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr %e unordered, align 4 - - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile 
double, ptr %e unordered, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr %e unordered, align 8 - - ret void -} - -; CHECK-LABEL: generic_monotonic_volatile -define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr %a monotonic, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr %a monotonic, align 1 - - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr %b monotonic, align 2 - - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr %c monotonic, align 4 - - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr %d monotonic, align 8 - - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr %e monotonic, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr %e monotonic, align 4 - - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr %e monotonic, align 8 - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr %e monotonic, align 8 - - ret void -} - -;; global statespace - -; CHECK-LABEL: global_plain -define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { - ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load i8, ptr addrspace(1) %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i8 %a.add, ptr addrspace(1) %a - - ; CHECK: ld.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr addrspace(1) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr addrspace(1) %b - - ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr addrspace(1) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr addrspace(1) %c - - ; CHECK: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr addrspace(1) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr addrspace(1) %d - - ; CHECK: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr addrspace(1) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr addrspace(1) %c - - ; CHECK: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr addrspace(1) %c - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr addrspace(1) %c - - ret void -} - -; CHECK-LABEL: global_volatile -define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load volatile i8, ptr addrspace(1) %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i8 %a.add, ptr addrspace(1) %a - - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr addrspace(1) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i16 %b.add, ptr addrspace(1) %b - - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr addrspace(1) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr addrspace(1) %c - - ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr addrspace(1) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr addrspace(1) %d - - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr addrspace(1) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr addrspace(1) %c - - ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr addrspace(1) %c - %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr addrspace(1) %c - - ret void -} - -; CHECK-LABEL: global_unordered -define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1 +; CHECK-LABEL: generic_acq_rel_volatile +define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1 + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a release, align 1 - ; CHECK: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2 + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2 + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b release, align 2 - ; CHECK: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4 + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4 + ; CHECK: st.release.sys.u32 
[%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c release, align 4 - ; CHECK: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8 + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8 + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d release, align 8 - ; CHECK: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4 + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e acquire, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(1) %e unordered, align 4 + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e release, align 4 - ; CHECK: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8 + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(1) %e unordered, align 8 + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e release, align 8 ret void } -; CHECK-LABEL: global_monotonic -define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1 +; CHECK-LABEL: generic_sc +define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a seq_cst, align 1 - ; CHECK: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b seq_cst, align 2 - ; CHECK: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c seq_cst, align 4 %c.add = add i32 
%c.load, 1 - ; CHECK: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c seq_cst, align 4 - ; CHECK: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d seq_cst, align 8 - ; CHECK: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e seq_cst, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e seq_cst, align 4 - ; CHECK: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e seq_cst, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e seq_cst, align 8 ret void } -; CHECK-LABEL: global_unordered_volatile -define void @global_unordered_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1 +; CHECK-LABEL: generic_sc_volatile +define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a seq_cst, align 1 - ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b seq_cst, align 2 - ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 
4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c seq_cst, align 4 - ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d seq_cst, align 8 - ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e seq_cst, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e seq_cst, align 4 - ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e 
seq_cst, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e seq_cst, align 8 ret void } -; CHECK-LABEL: global_monotonic_volatile -define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1 - - ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2 - - ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4 - - ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8 - - ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr 
addrspace(1) %e monotonic, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4 - - ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8 - - ret void -} +;; global statespace ; CHECK-LABEL: global_acq_rel define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { @@ -618,253 +275,113 @@ define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, p ret void } -;; shared statespace - -; CHECK-LABEL: shared_plain -define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { - ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load i8, ptr addrspace(3) %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i8 %a.add, ptr addrspace(3) %a - - ; CHECK: ld.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr addrspace(3) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr addrspace(3) %b - - ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr addrspace(3) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr addrspace(3) %c - - ; CHECK: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr addrspace(3) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr addrspace(3) %d - - ; CHECK: 
ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr addrspace(3) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr addrspace(3) %c - - ; CHECK: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr addrspace(3) %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr addrspace(3) %c - - ret void -} - -; CHECK-LABEL: shared_volatile -define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load volatile i8, ptr addrspace(3) %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i8 %a.add, ptr addrspace(3) %a - - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr addrspace(3) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i16 %b.add, ptr addrspace(3) %b - - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr addrspace(3) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr addrspace(3) %c - - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr addrspace(3) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr addrspace(3) %d - - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr addrspace(3) %c - %e.add = fadd float %e.load, 1. 
- ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr addrspace(3) %c - - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr addrspace(3) %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr addrspace(3) %c - - ret void -} - -; CHECK-LABEL: shared_unordered -define void @shared_unordered(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1 - - ; CHECK: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2 - - ; CHECK: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4 - - ; CHECK: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8 - - ; CHECK: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: 
st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(3) %e unordered, align 4 - - ; CHECK: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(3) %e unordered, align 8 - - ret void -} - -; CHECK-LABEL: shared_unordered_volatile -define void @shared_unordered_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1 +; CHECK-LABEL: global_seq_cst +define void @global_seq_cst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a seq_cst, align 1 - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u16 
[%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b seq_cst, align 2 - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c seq_cst, align 4 - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d seq_cst, align 8 - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e seq_cst, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e seq_cst, align 4 - ; CHECK: 
ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e seq_cst, align 8 ret void } -; CHECK-LABEL: shared_monotonic -define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1 +; CHECK-LABEL: global_seq_cst_volatile +define void @global_seq_cst_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a seq_cst, align 1 - ; CHECK: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: 
st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b seq_cst, align 2 - ; CHECK: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c seq_cst, align 4 - ; CHECK: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d seq_cst, align 8 - ; CHECK: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e seq_cst, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(3) %e 
monotonic, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e seq_cst, align 4 - ; CHECK: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e seq_cst, align 8 ret void } -; CHECK-LABEL: shared_monotonic_volatile -define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1 - - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2 - - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4 - - ; CHECK: 
ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8 - - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4 - - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8 - - ret void -} +;; shared statespace ; CHECK-LABEL: shared_acq_rel define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { @@ -948,332 +465,291 @@ define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, p ret void } -;; local statespace - -; CHECK-LABEL: local_plain -define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load i8, ptr addrspace(5) %a - %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i8 %a.add, ptr addrspace(5) %a - - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load i16, ptr addrspace(5) %b - %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store i16 %b.add, ptr addrspace(5) %b - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load i32, ptr 
addrspace(5) %c - %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store i32 %c.add, ptr addrspace(5) %c - - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load i64, ptr addrspace(5) %d - %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store i64 %d.add, ptr addrspace(5) %d - - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load float, ptr addrspace(5) %c - %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store float %e.add, ptr addrspace(5) %c - - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr addrspace(5) %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr addrspace(5) %c - - ret void -} - -; CHECK-LABEL: local_volatile -define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load volatile i8, ptr addrspace(5) %a +; CHECK-LABEL: shared_seq_cst +define void @shared_seq_cst(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i8 %a.add, ptr addrspace(5) %a + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a seq_cst, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load volatile i16, ptr addrspace(5) %b + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b seq_cst, 
align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store volatile i16 %b.add, ptr addrspace(5) %b + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b seq_cst, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load volatile i32, ptr addrspace(5) %c + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store volatile i32 %c.add, ptr addrspace(5) %c + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c seq_cst, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load volatile i64, ptr addrspace(5) %d + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store volatile i64 %d.add, ptr addrspace(5) %d + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d seq_cst, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load volatile float, ptr addrspace(5) %c - %e.add = fadd float %e.load, 1. 
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store volatile float %e.add, ptr addrspace(5) %c + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e seq_cst, align 4 + %e.add = fadd float %e.load, 1.0 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e seq_cst, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load volatile double, ptr addrspace(5) %c - %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store volatile double %f.add, ptr addrspace(5) %c + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e seq_cst, align 8 ret void } -; CHECK-LABEL: local_unordered -define void @local_unordered(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1 +; CHECK-LABEL: shared_seq_cst_volatile +define void @shared_seq_cst_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store 
atomic volatile i8 %a.add, ptr addrspace(3) %a seq_cst, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b seq_cst, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c seq_cst, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d seq_cst, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4 + ; 
CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e seq_cst, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(5) %e unordered, align 4 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e seq_cst, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(5) %e unordered, align 8 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e seq_cst, align 8 ret void } -; CHECK-LABEL: local_unordered_volatile -define void @local_unordered_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1 - %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1 - - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2 - %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2 - - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, 
ptr addrspace(5) %c unordered, align 4 - %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4 - - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8 - %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8 - - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4 - - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8 +;; local statespace - ret void -} +; CHECK-LABEL: local_acq_rel +define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by using PTX atomic operations. 
-; CHECK-LABEL: local_monotonic -define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1 + %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + store atomic i8 %a.add, ptr addrspace(5) %a release, align 1 ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2 + %b.load = load atomic i16, ptr addrspace(5) %b acquire, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2 + store atomic i16 %b.add, ptr addrspace(5) %b release, align 2 ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4 + %c.load = load atomic i32, ptr addrspace(5) %c acquire, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4 + store atomic i32 %c.add, ptr addrspace(5) %c release, align 4 ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8 + %d.load = load atomic i64, ptr addrspace(5) %d acquire, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8 + store atomic i64 %d.add, ptr addrspace(5) %d release, align 8 ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4 + %e.load = load atomic float, ptr addrspace(5) %e acquire, align 4 %e.add = fadd float %e.load, 1.0 ; 
CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4 + store atomic float %e.add, ptr addrspace(5) %e release, align 4 ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8 + %f.load = load atomic double, ptr addrspace(5) %e acquire, align 8 %f.add = fadd double %f.load, 1. ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8 + store atomic double %f.add, ptr addrspace(5) %e release, align 8 ret void } -; CHECK-LABEL: local_monotonic_volatile -define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { +; CHECK-LABEL: local_acq_rel_volatile +define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by using PTX atomic operations. 
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1 + %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + store atomic volatile i8 %a.add, ptr addrspace(5) %a release, align 1 ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2 + %b.load = load atomic volatile i16, ptr addrspace(5) %b acquire, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2 + store atomic volatile i16 %b.add, ptr addrspace(5) %b release, align 2 ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4 + %c.load = load atomic volatile i32, ptr addrspace(5) %c acquire, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4 + store atomic volatile i32 %c.add, ptr addrspace(5) %c release, align 4 ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8 + %d.load = load atomic volatile i64, ptr addrspace(5) %d acquire, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8 + store atomic volatile i64 %d.add, ptr addrspace(5) %d release, align 8 ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4 + %e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4 %e.add = fadd float %e.load, 1.0 ; CHECK: st.local.f32 
[%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4 + store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4 ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8 + %f.load = load atomic volatile double, ptr addrspace(5) %e acquire, align 8 %f.add = fadd double %f.load, 1. ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8 + store atomic volatile double %f.add, ptr addrspace(5) %e release, align 8 ret void } -; CHECK-LABEL: local_acq_rel -define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { +; CHECK-LABEL: local_seq_cst +define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by using PTX atomic operations. 
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1 + %a.load = load atomic i8, ptr addrspace(5) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(5) %a release, align 1 + store atomic i8 %a.add, ptr addrspace(5) %a seq_cst, align 1 ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(5) %b acquire, align 2 + %b.load = load atomic i16, ptr addrspace(5) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(5) %b release, align 2 + store atomic i16 %b.add, ptr addrspace(5) %b seq_cst, align 2 ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(5) %c acquire, align 4 + %c.load = load atomic i32, ptr addrspace(5) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(5) %c release, align 4 + store atomic i32 %c.add, ptr addrspace(5) %c seq_cst, align 4 ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(5) %d acquire, align 8 + %d.load = load atomic i64, ptr addrspace(5) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(5) %d release, align 8 + store atomic i64 %d.add, ptr addrspace(5) %d seq_cst, align 8 ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(5) %e acquire, align 4 + %e.load = load atomic float, ptr addrspace(5) %e seq_cst, align 4 %e.add = fadd float %e.load, 1.0 ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(5) %e release, align 4 + store atomic float %e.add, ptr addrspace(5) %e seq_cst, align 4 ; CHECK: ld.local.f64 
%fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(5) %e acquire, align 8 + %f.load = load atomic double, ptr addrspace(5) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(5) %e release, align 8 + store atomic double %f.add, ptr addrspace(5) %e seq_cst, align 8 ret void } -; CHECK-LABEL: local_acq_rel_volatile -define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { +; CHECK-LABEL: local_seq_cst_volatile +define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by using PTX atomic operations. + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1 + %a.load = load atomic volatile i8, ptr addrspace(5) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(5) %a release, align 1 + store atomic volatile i8 %a.add, ptr addrspace(5) %a seq_cst, align 1 ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(5) %b acquire, align 2 + %b.load = load atomic volatile i16, ptr addrspace(5) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(5) %b release, align 2 + store atomic volatile i16 %b.add, ptr addrspace(5) %b seq_cst, align 2 ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(5) %c acquire, align 4 + %c.load = load atomic volatile i32, ptr addrspace(5) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: 
st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(5) %c release, align 4 + store atomic volatile i32 %c.add, ptr addrspace(5) %c seq_cst, align 4 ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(5) %d acquire, align 8 + %d.load = load atomic volatile i64, ptr addrspace(5) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(5) %d release, align 8 + store atomic volatile i64 %d.add, ptr addrspace(5) %d seq_cst, align 8 ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4 + %e.load = load atomic volatile float, ptr addrspace(5) %e seq_cst, align 4 %e.add = fadd float %e.load, 1.0 ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4 + store atomic volatile float %e.add, ptr addrspace(5) %e seq_cst, align 4 ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(5) %e acquire, align 8 + %f.load = load atomic volatile double, ptr addrspace(5) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(5) %e release, align 8 + store atomic volatile double %f.add, ptr addrspace(5) %e seq_cst, align 8 + + ; TODO: LLVM IR Verifier does not support atomics on vector types. ret void } + +; TODO: add plain,atomic,volatile,atomic volatile tests +; for .const and .param statespaces
\ No newline at end of file diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll index 4c5e0920ce1a..aac73f71a676 100644 --- a/llvm/test/CodeGen/NVPTX/load-store.ll +++ b/llvm/test/CodeGen/NVPTX/load-store.ll @@ -1,5 +1,13 @@ -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -check-prefixes=CHECK,SM60 %s ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} +; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -check-prefixes=CHECK,SM70 +; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} + +; TODO: add i1, <8 x i8>, and <6 x i8> vector tests. + +; TODO: add test for vectors that exceed 128-bit length +; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors +; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. ; generic statespace @@ -36,10 +44,76 @@ define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { store float %e.add, ptr %c ; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load double, ptr %c + %f.load = load double, ptr %d %f.add = fadd double %f.load, 1. ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store double %f.add, ptr %c + store double %f.add, ptr %d + + ; TODO: make the lowering of this weak vector ops consistent with + ; the ones of the next tests. This test lowers to a weak PTX + ; vector op, but next test lowers to a vector PTX op. + ; CHECK: ld.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load <2 x i8>, ptr %b + %h.add = add <2 x i8> %h.load, <i8 1, i8 1> + ; CHECK: st.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <2 x i8> %h.add, ptr %b + + ; TODO: make the lowering of this weak vector ops consistent with + ; the ones of the previous test. This test lowers to a weak + ; PTX scalar op, but prior test lowers to a vector PTX op. 
+ ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load <4 x i8>, ptr %c + %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> + ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <4 x i8> %i.add, ptr %c + + ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load <2 x i16>, ptr %c + %j.add = add <2 x i16> %j.load, <i16 1, i16 1> + ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <2 x i16> %j.add, ptr %c + + ; CHECK: ld.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load <4 x i16>, ptr %d + %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> + ; CHECK: st.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <4 x i16> %k.add, ptr %d + + ; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load <2 x i32>, ptr %d + %l.add = add <2 x i32> %l.load, <i32 1, i32 1> + ; CHECK: st.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store <2 x i32> %l.add, ptr %d + + ; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load <4 x i32>, ptr %d + %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> + ; CHECK: st.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store <4 x i32> %m.add, ptr %d + + ; CHECK: ld.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load <2 x i64>, ptr %d + %n.add = add <2 x i64> %n.load, <i64 1, i64 1> + ; CHECK: st.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store <2 x i64> %n.add, ptr %d + + ; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load <2 x float>, ptr %d + %o.add = fadd <2 x float> %o.load, <float 1., float 1.> + ; CHECK: st.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store <2 x float> %o.add, ptr %d + + ; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load <4 x float>, ptr %d 
+ %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> + ; CHECK: st.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store <4 x float> %p.add, ptr %d + + ; CHECK: ld.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load <2 x double>, ptr %d + %q.add = fadd <2 x double> %q.load, <double 1., double 1.> + ; CHECK: st.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store <2 x double> %q.add, ptr %d ret void } @@ -82,45 +156,136 @@ define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store volatile double %f.add, ptr %c + ; TODO: volatile, atomic, and volatile atomic memory operations on vector types. + ; Currently, LLVM: + ; - does not allow atomic operations on vectors. + ; - it allows volatile operations but not clear what that means. + ; Following both semantics make sense in general and PTX supports both: + ; - volatile/atomic/volatile atomic applies to the whole vector + ; - volatile/atomic/volatile atomic applies elementwise + ; Actions required: + ; - clarify LLVM semantics for volatile on vectors and align the NVPTX backend with those + ; Below tests show that the current implementation picks the semantics in an inconsistent way + ; * volatile <2 x i8> lowers to "elementwise volatile" + ; * <4 x i8> lowers to "full vector volatile" + ; - provide support for vector atomics, e.g., by extending LLVM IR or via intrinsics + ; - update tests in load-store-sm70.ll as well. + + ; TODO: make this operation consistent with the one for <4 x i8> + ; This operation lowers to a "element wise volatile PTX operation". 
+ ; CHECK: ld.volatile.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load volatile <2 x i8>, ptr %b + %h.add = add <2 x i8> %h.load, <i8 1, i8 1> + ; CHECK: st.volatile.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile <2 x i8> %h.add, ptr %b + + ; TODO: make this operation consistent with the one for <2 x i8> + ; This operation lowers to a "full vector volatile PTX operation". + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load volatile <4 x i8>, ptr %c + %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile <4 x i8> %i.add, ptr %c + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load volatile <2 x i16>, ptr %c + %j.add = add <2 x i16> %j.load, <i16 1, i16 1> + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile <2 x i16> %j.add, ptr %c + + ; CHECK: ld.volatile.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load volatile <4 x i16>, ptr %d + %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> + ; CHECK: st.volatile.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile <4 x i16> %k.add, ptr %d + + ; CHECK: ld.volatile.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load volatile <2 x i32>, ptr %d + %l.add = add <2 x i32> %l.load, <i32 1, i32 1> + ; CHECK: st.volatile.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile <2 x i32> %l.add, ptr %d + + ; CHECK: ld.volatile.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load volatile <4 x i32>, ptr %d + %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> + ; CHECK: st.volatile.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile <4 x i32> %m.add, ptr %d + + ; CHECK: ld.volatile.v2.u64 {%rd{{[0-9]+}}, 
%rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load volatile <2 x i64>, ptr %d + %n.add = add <2 x i64> %n.load, <i64 1, i64 1> + ; CHECK: st.volatile.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store volatile <2 x i64> %n.add, ptr %d + + ; CHECK: ld.volatile.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load volatile <2 x float>, ptr %d + %o.add = fadd <2 x float> %o.load, <float 1., float 1.> + ; CHECK: st.volatile.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile <2 x float> %o.add, ptr %d + + ; CHECK: ld.volatile.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load volatile <4 x float>, ptr %d + %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> + ; CHECK: st.volatile.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile <4 x float> %p.add, ptr %d + + ; CHECK: ld.volatile.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load volatile <2 x double>, ptr %d + %q.add = fadd <2 x double> %q.load, <double 1., double 1.> + ; CHECK: st.volatile.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store volatile <2 x double> %q.add, ptr %d + ret void } ; CHECK-LABEL: generic_monotonic define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a monotonic, align 1 - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u16 
%rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr %b monotonic, align 2 - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c monotonic, align 4 - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d monotonic, align 8 - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e monotonic, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e monotonic, align 4 - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e 
monotonic, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e monotonic, align 8 ret void @@ -169,40 +334,52 @@ define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) ; CHECK-LABEL: generic_unordered define void @generic_unordered(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a unordered, align 1 - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr %b unordered, align 2 - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c unordered, align 4 - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, 
[%rd{{[0-9]+}}] + ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d unordered, align 8 - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e unordered, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e unordered, align 4 - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e unordered, align 8 ret void @@ -289,6 +466,66 @@ define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspac ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store double %f.add, ptr addrspace(1) %c + ; CHECK: ld.global.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load <2 x i8>, ptr addrspace(1) %b + %h.add = add <2 x i8> %h.load, <i8 1, i8 1> + ; CHECK: st.global.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <2 x i8> %h.add, ptr addrspace(1) %b + + ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load <4 x i8>, ptr addrspace(1) %c + %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> + ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <4 x i8> %i.add, ptr addrspace(1) %c + + ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load <2 x i16>, ptr addrspace(1) %c + %j.add = add <2 x i16> %j.load, <i16 1, i16 1> + ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <2 x i16> %j.add, ptr addrspace(1) %c + + ; CHECK: ld.global.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load <4 x i16>, ptr addrspace(1) %d + %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> + ; CHECK: st.global.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <4 x i16> %k.add, ptr addrspace(1) %d + + ; CHECK: ld.global.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load <2 x i32>, ptr addrspace(1) %d + %l.add = add <2 x i32> %l.load, <i32 1, i32 1> + ; CHECK: st.global.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store <2 x i32> %l.add, ptr addrspace(1) %d + + ; CHECK: ld.global.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, 
[%rd{{[0-9]+}}] + %m.load = load <4 x i32>, ptr addrspace(1) %d + %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> + ; CHECK: st.global.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store <4 x i32> %m.add, ptr addrspace(1) %d + + ; CHECK: ld.global.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load <2 x i64>, ptr addrspace(1) %d + %n.add = add <2 x i64> %n.load, <i64 1, i64 1> + ; CHECK: st.global.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store <2 x i64> %n.add, ptr addrspace(1) %d + + ; CHECK: ld.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load <2 x float>, ptr addrspace(1) %d + %o.add = fadd <2 x float> %o.load, <float 1., float 1.> + ; CHECK: st.global.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store <2 x float> %o.add, ptr addrspace(1) %d + + ; CHECK: ld.global.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load <4 x float>, ptr addrspace(1) %d + %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> + ; CHECK: st.global.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store <4 x float> %p.add, ptr addrspace(1) %d + + ; CHECK: ld.global.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load <2 x double>, ptr addrspace(1) %d + %q.add = fadd <2 x double> %q.load, <double 1., double 1.> + ; CHECK: st.global.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store <2 x double> %q.add, ptr addrspace(1) %d + ret void } @@ -330,45 +567,117 @@ define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrs ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store volatile double %f.add, ptr addrspace(1) %c + ; CHECK: ld.volatile.global.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load volatile <2 x i8>, ptr addrspace(1) %b + %h.add = add <2 x i8> %h.load, <i8 1, i8 1> + ; 
CHECK: st.volatile.global.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile<2 x i8> %h.add, ptr addrspace(1) %b + + ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load volatile <4 x i8>, ptr addrspace(1) %c + %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> + ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile<4 x i8> %i.add, ptr addrspace(1) %c + + ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load volatile <2 x i16>, ptr addrspace(1) %c + %j.add = add <2 x i16> %j.load, <i16 1, i16 1> + ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile<2 x i16> %j.add, ptr addrspace(1) %c + + ; CHECK: ld.volatile.global.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load volatile <4 x i16>, ptr addrspace(1) %d + %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> + ; CHECK: st.volatile.global.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile<4 x i16> %k.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load volatile <2 x i32>, ptr addrspace(1) %d + %l.add = add <2 x i32> %l.load, <i32 1, i32 1> + ; CHECK: st.volatile.global.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile<2 x i32> %l.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load volatile <4 x i32>, ptr addrspace(1) %d + %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> + ; CHECK: st.volatile.global.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile<4 x i32> %m.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load volatile <2 x i64>, ptr addrspace(1) %d + 
%n.add = add <2 x i64> %n.load, <i64 1, i64 1> + ; CHECK: st.volatile.global.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store volatile<2 x i64> %n.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load volatile <2 x float>, ptr addrspace(1) %d + %o.add = fadd <2 x float> %o.load, <float 1., float 1.> + ; CHECK: st.volatile.global.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile<2 x float> %o.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load volatile <4 x float>, ptr addrspace(1) %d + %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> + ; CHECK: st.volatile.global.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile<4 x float> %p.add, ptr addrspace(1) %d + + ; CHECK: ld.volatile.global.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load volatile <2 x double>, ptr addrspace(1) %d + %q.add = fadd <2 x double> %q.load, <double 1., double 1.> + ; CHECK: st.volatile.global.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store volatile<2 x double> %q.add, ptr addrspace(1) %d + ret void } ; CHECK-LABEL: global_monotonic define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, 
ptr addrspace(1) %a monotonic, align 1 - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2 - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4 - ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8 - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: 
st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4 - ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8 ret void @@ -376,40 +685,52 @@ define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addr ; CHECK-LABEL: global_monotonic_volatile define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1 - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2 %b.add = add i16 %b.load, 1 
- ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2 - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4 - ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8 - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, 
align 4 - ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8 ret void @@ -417,40 +738,52 @@ define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ; CHECK-LABEL: global_unordered define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1 - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2 - ; CHECK: 
ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4 - ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8 - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e unordered, align 4 - ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e unordered, align 8 ret void @@ -458,40 +791,52 @@ define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addr ; CHECK-LABEL: global_unordered_volatile define void @global_unordered_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1 - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2 - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 
4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4 - ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8 - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4 - ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8 ret void @@ -537,6 +882,66 @@ define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspac ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store double %f.add, ptr addrspace(3) %c + ; CHECK: ld.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load <2 x i8>, ptr addrspace(3) %b + %h.add = add <2 x i8> %h.load, <i8 1, i8 1> + ; CHECK: st.shared.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <2 x i8> %h.add, ptr addrspace(3) %b + + ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load <4 x i8>, ptr addrspace(3) %c + %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> + ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <4 x i8> %i.add, ptr addrspace(3) %c + + ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load <2 x i16>, ptr addrspace(3) %c + %j.add = add <2 x i16> %j.load, <i16 1, i16 1> + ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <2 x i16> %j.add, ptr addrspace(3) %c + + ; CHECK: ld.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load <4 x i16>, ptr addrspace(3) %d + %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> + ; CHECK: st.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <4 x i16> %k.add, ptr addrspace(3) %d + + ; CHECK: ld.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load <2 x i32>, ptr addrspace(3) %d + %l.add = add <2 x i32> %l.load, <i32 1, i32 1> + ; CHECK: st.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store <2 x i32> %l.add, ptr addrspace(3) %d + + ; CHECK: ld.shared.v4.u32 {%r{{[0-9]+}}, 
%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load <4 x i32>, ptr addrspace(3) %d + %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> + ; CHECK: st.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store <4 x i32> %m.add, ptr addrspace(3) %d + + ; CHECK: ld.shared.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load <2 x i64>, ptr addrspace(3) %d + %n.add = add <2 x i64> %n.load, <i64 1, i64 1> + ; CHECK: st.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store <2 x i64> %n.add, ptr addrspace(3) %d + + ; CHECK: ld.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load <2 x float>, ptr addrspace(3) %d + %o.add = fadd <2 x float> %o.load, <float 1., float 1.> + ; CHECK: st.shared.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store <2 x float> %o.add, ptr addrspace(3) %d + + ; CHECK: ld.shared.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load <4 x float>, ptr addrspace(3) %d + %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> + ; CHECK: st.shared.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store <4 x float> %p.add, ptr addrspace(3) %d + + ; CHECK: ld.shared.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load <2 x double>, ptr addrspace(3) %d + %q.add = fadd <2 x double> %q.load, <double 1., double 1.> + ; CHECK: st.shared.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store <2 x double> %q.add, ptr addrspace(3) %d + ret void } @@ -578,45 +983,119 @@ define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrs ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store volatile double %f.add, ptr addrspace(3) %c + ; CHECK: ld.volatile.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load volatile <2 x i8>, ptr addrspace(3) %b + %h.add = 
add <2 x i8> %h.load, <i8 1, i8 1> + ; CHECK: st.volatile.shared.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile <2 x i8> %h.add, ptr addrspace(3) %b + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load volatile <4 x i8>, ptr addrspace(3) %c + %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile <4 x i8> %i.add, ptr addrspace(3) %c + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load volatile <2 x i16>, ptr addrspace(3) %c + %j.add = add <2 x i16> %j.load, <i16 1, i16 1> + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile <2 x i16> %j.add, ptr addrspace(3) %c + + ; CHECK: ld.volatile.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load volatile <4 x i16>, ptr addrspace(3) %d + %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> + ; CHECK: st.volatile.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile <4 x i16> %k.add, ptr addrspace(3) %d + + ; CHECK: ld.volatile.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load volatile <2 x i32>, ptr addrspace(3) %d + %l.add = add <2 x i32> %l.load, <i32 1, i32 1> + ; CHECK: st.volatile.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile <2 x i32> %l.add, ptr addrspace(3) %d + + ; CHECK: ld.volatile.shared.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load volatile <4 x i32>, ptr addrspace(3) %d + %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> + ; CHECK: st.volatile.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile <4 x i32> %m.add, ptr addrspace(3) %d + + ; CHECK: ld.volatile.shared.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = 
load volatile <2 x i64>, ptr addrspace(3) %d + %n.add = add <2 x i64> %n.load, <i64 1, i64 1> + ; CHECK: st.volatile.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store volatile <2 x i64> %n.add, ptr addrspace(3) %d + + ; CHECK: ld.volatile.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load volatile <2 x float>, ptr addrspace(3) %d + %o.add = fadd <2 x float> %o.load, <float 1., float 1.> + ; CHECK: st.volatile.shared.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile <2 x float> %o.add, ptr addrspace(3) %d + + ; CHECK: ld.volatile.shared.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load volatile <4 x float>, ptr addrspace(3) %d + %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> + ; CHECK: st.volatile.shared.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile <4 x float> %p.add, ptr addrspace(3) %d + + ; CHECK: ld.volatile.shared.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load volatile <2 x double>, ptr addrspace(3) %d + %q.add = fadd <2 x double> %q.load, <double 1., double 1.> + ; CHECK: st.volatile.shared.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store volatile <2 x double> %q.add, ptr addrspace(3) %d + ret void } ; CHECK-LABEL: shared_monotonic define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; TODO: optimize .sys.shared to .cta.shared or .cluster.shared. 
+ + ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1 - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2 - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4 - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; 
SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8 - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4 - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8 ret void @@ -665,40 +1144,54 @@ define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ; CHECK-LABEL: shared_unordered define void @shared_unordered(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; TODO: optimize .sys.shared to .cta.shared or .cluster.shared. 
+ + ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1 - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2 - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4 - ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; 
SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8 - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e unordered, align 4 - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e unordered, align 8 ret void @@ -785,11 +1278,74 @@ define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store double %f.add, ptr addrspace(5) %c + ; CHECK: ld.local.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load <2 x i8>, ptr addrspace(5) %b + %h.add = add <2 x i8> %h.load, <i8 1, i8 1> + ; CHECK: st.local.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <2 x i8> %h.add, ptr addrspace(5) %b + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load <4 x i8>, ptr addrspace(5) %c + %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <4 x i8> %i.add, ptr addrspace(5) %c + + ; 
CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load <2 x i16>, ptr addrspace(5) %c + %j.add = add <2 x i16> %j.load, <i16 1, i16 1> + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store <2 x i16> %j.add, ptr addrspace(5) %c + + ; CHECK: ld.local.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load <4 x i16>, ptr addrspace(5) %d + %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> + ; CHECK: st.local.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store <4 x i16> %k.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load <2 x i32>, ptr addrspace(5) %d + %l.add = add <2 x i32> %l.load, <i32 1, i32 1> + ; CHECK: st.local.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store <2 x i32> %l.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load <4 x i32>, ptr addrspace(5) %d + %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> + ; CHECK: st.local.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store <4 x i32> %m.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load <2 x i64>, ptr addrspace(5) %d + %n.add = add <2 x i64> %n.load, <i64 1, i64 1> + ; CHECK: st.local.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store <2 x i64> %n.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load <2 x float>, ptr addrspace(5) %d + %o.add = fadd <2 x float> %o.load, <float 1., float 1.> + ; CHECK: st.local.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store <2 x float> %o.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load <4 x float>, 
ptr addrspace(5) %d + %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> + ; CHECK: st.local.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store <4 x float> %p.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load <2 x double>, ptr addrspace(5) %d + %q.add = fadd <2 x double> %q.load, <double 1., double 1.> + ; CHECK: st.local.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store <2 x double> %q.add, ptr addrspace(5) %d + ret void } ; CHECK-LABEL: local_volatile define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by using volatile operations. + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load volatile i8, ptr addrspace(5) %a %a.add = add i8 %a.load, 1 @@ -826,11 +1382,74 @@ define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrsp ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store volatile double %f.add, ptr addrspace(5) %c + ; CHECK: ld.local.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %h.load = load volatile <2 x i8>, ptr addrspace(5) %b + %h.add = add <2 x i8> %h.load, <i8 1, i8 1> + ; CHECK: st.local.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile <2 x i8> %h.add, ptr addrspace(5) %b + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %i.load = load volatile <4 x i8>, ptr addrspace(5) %c + %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1> + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile <4 x i8> %i.add, ptr addrspace(5) %c + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %j.load = load volatile <2 x i16>, ptr addrspace(5) %c + %j.add = add <2 x i16> %j.load, <i16 1, i16 1> + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store volatile <2 x i16> 
%j.add, ptr addrspace(5) %c + + ; CHECK: ld.local.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}] + %k.load = load volatile <4 x i16>, ptr addrspace(5) %d + %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1> + ; CHECK: st.local.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}} + store volatile <4 x i16> %k.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %l.load = load volatile <2 x i32>, ptr addrspace(5) %d + %l.add = add <2 x i32> %l.load, <i32 1, i32 1> + ; CHECK: st.local.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile <2 x i32> %l.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}] + %m.load = load volatile <4 x i32>, ptr addrspace(5) %d + %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1> + ; CHECK: st.local.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} + store volatile <4 x i32> %m.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %n.load = load volatile <2 x i64>, ptr addrspace(5) %d + %n.add = add <2 x i64> %n.load, <i64 1, i64 1> + ; CHECK: st.local.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}} + store volatile <2 x i64> %n.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %o.load = load volatile <2 x float>, ptr addrspace(5) %d + %o.add = fadd <2 x float> %o.load, <float 1., float 1.> + ; CHECK: st.local.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile <2 x float> %o.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}] + %p.load = load volatile <4 x float>, ptr addrspace(5) %d + %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.> + ; CHECK: 
st.local.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + store volatile <4 x float> %p.add, ptr addrspace(5) %d + + ; CHECK: ld.local.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}] + %q.load = load volatile <2 x double>, ptr addrspace(5) %d + %q.add = fadd <2 x double> %q.load, <double 1., double 1.> + ; CHECK: st.local.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}} + store volatile <2 x double> %q.add, ptr addrspace(5) %d + ret void } ; CHECK-LABEL: local_monotonic define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by using PTX atomic operations. + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1 %a.add = add i8 %a.load, 1 @@ -872,6 +1491,9 @@ define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrs ; CHECK-LABEL: local_monotonic_volatile define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; TODO: generate PTX that preserves Concurrent Forward Progress + ; by generating atomic or volatile operations + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1 %a.add = add i8 %a.load, 1 @@ -992,3 +1614,6 @@ define void @local_unordered_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ret void } + +; TODO: add plain,atomic,volatile,atomic volatile tests +; for .const and .param statespaces
\ No newline at end of file diff --git a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll index f1c9da078a16..70b421f8c0c5 100644 --- a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll +++ b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll @@ -25,7 +25,6 @@ ; CHECK-NEXT: Lower Garbage Collection Instructions ; CHECK-NEXT: Shadow Stack GC Lowering ; CHECK-NEXT: Remove unreachable blocks from the CFG -; CHECK-NEXT: Expand vector predication intrinsics ; CHECK-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining) ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll index be0fbf3abcda..f4f492782eb6 100644 --- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll +++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll @@ -62,7 +62,6 @@ ; CHECK-NEXT: Constant Hoisting ; CHECK-NEXT: Replace intrinsics with calls to vector library ; CHECK-NEXT: Partially inline calls to library functions -; CHECK-NEXT: Expand vector predication intrinsics ; CHECK-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining) ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics diff --git a/llvm/test/CodeGen/PowerPC/aix-base-pointer.ll b/llvm/test/CodeGen/PowerPC/aix-base-pointer.ll index ab222d770360..5e66e5ec2763 100644 --- a/llvm/test/CodeGen/PowerPC/aix-base-pointer.ll +++ b/llvm/test/CodeGen/PowerPC/aix-base-pointer.ll @@ -6,6 +6,7 @@ ; Use an overaligned buffer to force base-pointer usage. Test verifies: ; - base pointer register (r30) is saved/defined/restored. +; - frame pointer register (r31) is saved/defined/restored. ; - stack frame is allocated with correct alignment. ; - Address of %AlignedBuffer is calculated based off offset from the stack ; pointer. 
@@ -25,7 +26,9 @@ declare void @callee(ptr) ; 32BIT: subfic 0, 0, -224 ; 32BIT: stwux 1, 1, 0 ; 32BIT: addi 3, 1, 64 +; 32BIT: stw 31, -12(30) ; 32BIT: bl .callee +; 32BIT: lwz 31, -12(30) ; 32BIT: mr 1, 30 ; 32BIT: lwz 30, -16(1) @@ -36,6 +39,8 @@ declare void @callee(ptr) ; 64BIT: subfic 0, 0, -288 ; 64BIT: stdux 1, 1, 0 ; 64BIT: addi 3, 1, 128 +; 64BIT: std 31, -16(30) ; 64BIT: bl .callee +; 64BIT: ld 31, -16(30) ; 64BIT: mr 1, 30 ; 64BIT: ld 30, -24(1) diff --git a/llvm/test/CodeGen/PowerPC/builtins-bcd-assist.ll b/llvm/test/CodeGen/PowerPC/builtins-bcd-assist.ll new file mode 100644 index 000000000000..cc5d6bee3c97 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/builtins-bcd-assist.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux \ +; RUN: --ppc-asm-full-reg-names -mcpu=pwr7 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-aix \ +; RUN: --ppc-asm-full-reg-names -mcpu=pwr7 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-aix \ +; RUN: --ppc-asm-full-reg-names -mcpu=pwr7 < %s | FileCheck %s --check-prefix=CHECK-AIX32 + +define dso_local i64 @cdtbcd_test(i64 noundef %ll) { +; CHECK-LABEL: cdtbcd_test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cdtbcd r3, r3 +; CHECK-NEXT: clrldi r3, r3, 32 +; CHECK-NEXT: blr +; CHECK-AIX32-LABEL: cdtbcd_test: +; CHECK-AIX32: # %bb.0: # %entry +; CHECK-AIX32-NEXT: li r3, 0 +; CHECK-AIX32-NEXT: cdtbcd r4, r4 +; CHECK-AIX32-NEXT: blr +entry: + %conv = trunc i64 %ll to i32 + %0 = tail call i32 @llvm.ppc.cdtbcd(i32 %conv) + %conv1 = zext i32 %0 to i64 + ret i64 %conv1 +} + +define dso_local zeroext i32 @cdtbcd_test_ui(i32 noundef zeroext %ui) { +; CHECK-LABEL: cdtbcd_test_ui: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cdtbcd r3, r3 +; CHECK-NEXT: clrldi r3, r3, 32 +; CHECK-NEXT: blr +; CHECK-AIX32-LABEL: cdtbcd_test_ui: +; CHECK-AIX32: 
# %bb.0: # %entry +; CHECK-AIX32-NEXT: cdtbcd r3, r3 +; CHECK-AIX32-NEXT: blr +entry: + %0 = tail call i32 @llvm.ppc.cdtbcd(i32 %ui) + ret i32 %0 +} + +define dso_local i64 @cbcdtd_test(i64 noundef %ll) { +; CHECK-LABEL: cbcdtd_test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cbcdtd r3, r3 +; CHECK-NEXT: clrldi r3, r3, 32 +; CHECK-NEXT: blr +; CHECK-AIX32-LABEL: cbcdtd_test: +; CHECK-AIX32: # %bb.0: # %entry +; CHECK-AIX32-NEXT: li r3, 0 +; CHECK-AIX32-NEXT: cbcdtd r4, r4 +; CHECK-AIX32-NEXT: blr +entry: + %conv = trunc i64 %ll to i32 + %0 = tail call i32@llvm.ppc.cbcdtd(i32 %conv) + %conv1 = zext i32 %0 to i64 + ret i64 %conv1 +} + +define dso_local zeroext i32 @cbcdtd_test_ui(i32 noundef zeroext %ui) { +; CHECK-LABEL: cbcdtd_test_ui: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cbcdtd r3, r3 +; CHECK-NEXT: clrldi r3, r3, 32 +; CHECK-NEXT: blr +; CHECK-AIX32-LABEL: cbcdtd_test_ui: +; CHECK-AIX32: # %bb.0: # %entry +; CHECK-AIX32-NEXT: cbcdtd r3, r3 +; CHECK-AIX32-NEXT: blr +entry: + %0 = tail call i32 @llvm.ppc.cbcdtd(i32 %ui) + ret i32 %0 +} + +define dso_local i64 @addg6s_test(i64 noundef %ll, i64 noundef %ll2) { +; CHECK-LABEL: addg6s_test: +; CHECK: bb.0: # %entry +; CHECK-NEXT: addg6s r3, r3, r4 +; CHECK-NEXT: clrldi r3, r3, 32 +; CHECK-NEXT: blr +; CHECK-AIX32-LABEL: addg6s_test: +; CHECK-AIX32: # %bb.0: # %entry +; CHECK-AIX32-NEXT: li r3, 0 +; CHECK-AIX32-NEXT: addg6s r4, r4, r6 +; CHECK-AIX32-NEXT: blr +entry: + %conv = trunc i64 %ll to i32 + %conv1 = trunc i64 %ll2 to i32 + %0 = tail call i32 @llvm.ppc.addg6s(i32 %conv, i32 %conv1) + %conv2 = zext i32 %0 to i64 + ret i64 %conv2 +} + +define dso_local zeroext i32 @addg6s_test_ui(i32 noundef zeroext %ui, i32 noundef zeroext %ui2) { +; CHECK-LABEL: addg6s_test_ui: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addg6s r3, r3, r4 +; CHECK-NEXT: clrldi r3, r3, 32 +; CHECK-NEXT: blr +; CHECK-AIX32-LABEL: addg6s_test_ui: +; CHECK-AIX32: # %bb.0: # %entry +; CHECK-AIX32-NEXT: addg6s r3, r3, r4 +; CHECK-AIX32-NEXT: blr 
+entry: + %0 = tail call i32 @llvm.ppc.addg6s(i32 %ui, i32 %ui2) + ret i32 %0 +} + +declare i32 @llvm.ppc.cdtbcd(i32) +declare i32 @llvm.ppc.cbcdtd(i32) +declare i32 @llvm.ppc.addg6s(i32, i32) diff --git a/llvm/test/CodeGen/PowerPC/builtins-ppc-bcd-assist.ll b/llvm/test/CodeGen/PowerPC/builtins-ppc-bcd-assist.ll new file mode 100644 index 000000000000..d188f6014f0c --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/builtins-ppc-bcd-assist.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux \ +; RUN: --ppc-asm-full-reg-names -mcpu=pwr7 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-aix \ +; RUN: --ppc-asm-full-reg-names -mcpu=pwr7 < %s | FileCheck %s + +define i64 @cdtbcd_test(i64 noundef %ll) { +; CHECK-LABEL: cdtbcd_test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cdtbcd r3, r3 +; CHECK-NEXT: blr +entry: + %0 = tail call i64 @llvm.ppc.cdtbcdd(i64 %ll) + ret i64 %0 +} + +define zeroext i32 @cdtbcd_test_ui(i32 noundef zeroext %ui) { +; CHECK-LABEL: cdtbcd_test_ui: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cdtbcd r3, r3 +; CHECK-NEXT: clrldi r3, r3, 32 +; CHECK-NEXT: blr +entry: + %conv = zext i32 %ui to i64 + %0 = tail call i64 @llvm.ppc.cdtbcdd(i64 %conv) + %conv1 = trunc i64 %0 to i32 + ret i32 %conv1 +} + +define i64 @cbcdtd_test(i64 noundef %ll) { +; CHECK-LABEL: cbcdtd_test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cbcdtd r3, r3 +; CHECK-NEXT: blr +entry: + %0 = tail call i64 @llvm.ppc.cbcdtdd(i64 %ll) + ret i64 %0 +} + +define zeroext i32 @cbcdtd_test_ui(i32 noundef zeroext %ui) { +; CHECK-LABEL: cbcdtd_test_ui: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cbcdtd r3, r3 +; CHECK-NEXT: clrldi r3, r3, 32 +; CHECK-NEXT: blr +entry: + %conv = zext i32 %ui to i64 + %0 = tail call i64 @llvm.ppc.cbcdtdd(i64 %conv) + %conv1 = trunc i64 %0 to i32 + ret i32 %conv1 +} + +define i64 @addg6s_test(i64 noundef 
%ll, i64 noundef %ll2) { +; CHECK-LABEL: addg6s_test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addg6s r3, r3, r4 +; CHECK-NEXT: blr +entry: + %0 = tail call i64 @llvm.ppc.addg6sd(i64 %ll, i64 %ll2) + ret i64 %0 +} + +define zeroext i32 @addg6s_test_ui(i32 noundef zeroext %ui, i32 noundef zeroext %ui2) { +; CHECK-LABEL: addg6s_test_ui: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addg6s r3, r3, r4 +; CHECK-NEXT: clrldi r3, r3, 32 +; CHECK-NEXT: blr +entry: + %conv = zext i32 %ui to i64 + %conv1 = zext i32 %ui2 to i64 + %0 = tail call i64 @llvm.ppc.addg6sd(i64 %conv, i64 %conv1) + %conv2 = trunc i64 %0 to i32 + ret i32 %conv2 +} + +declare i64 @llvm.ppc.cdtbcdd(i64) +declare i64 @llvm.ppc.cbcdtdd(i64) +declare i64 @llvm.ppc.addg6sd(i64, i64) diff --git a/llvm/test/CodeGen/PowerPC/common-chain.ll b/llvm/test/CodeGen/PowerPC/common-chain.ll index ccf0e4520f46..b71a360d1be1 100644 --- a/llvm/test/CodeGen/PowerPC/common-chain.ll +++ b/llvm/test/CodeGen/PowerPC/common-chain.ll @@ -743,219 +743,214 @@ define signext i32 @spill_reduce_succ(ptr %input1, ptr %input2, ptr %output, i64 ; CHECK-NEXT: std r9, -184(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r8, -176(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r7, -168(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r4, -160(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r3, -160(r1) # 8-byte Folded Spill ; CHECK-NEXT: ble cr0, .LBB7_7 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: sldi r4, r6, 2 -; CHECK-NEXT: li r6, 1 -; CHECK-NEXT: mr r0, r10 -; CHECK-NEXT: std r10, -192(r1) # 8-byte Folded Spill -; CHECK-NEXT: cmpdi r4, 1 -; CHECK-NEXT: iselgt r4, r4, r6 -; CHECK-NEXT: addi r7, r4, -1 -; CHECK-NEXT: clrldi r6, r4, 63 -; CHECK-NEXT: cmpldi r7, 3 +; CHECK-NEXT: sldi r6, r6, 2 +; CHECK-NEXT: li r7, 1 +; CHECK-NEXT: mr r30, r10 +; CHECK-NEXT: cmpdi r6, 1 +; CHECK-NEXT: iselgt r7, r6, r7 +; CHECK-NEXT: addi r8, r7, -1 +; CHECK-NEXT: clrldi r6, r7, 63 +; CHECK-NEXT: cmpldi r8, 3 ; CHECK-NEXT: blt cr0, .LBB7_4 ; CHECK-NEXT: # 
%bb.2: # %for.body.preheader.new -; CHECK-NEXT: ld r0, -192(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r30, -184(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r8, -176(r1) # 8-byte Folded Reload -; CHECK-NEXT: rldicl r7, r4, 62, 2 -; CHECK-NEXT: ld r9, -168(r1) # 8-byte Folded Reload -; CHECK-NEXT: add r11, r0, r30 -; CHECK-NEXT: add r4, r0, r0 -; CHECK-NEXT: mulli r23, r0, 24 -; CHECK-NEXT: add r14, r0, r8 -; CHECK-NEXT: sldi r12, r0, 5 -; CHECK-NEXT: add r31, r0, r9 -; CHECK-NEXT: sldi r9, r9, 3 -; CHECK-NEXT: sldi r18, r0, 4 -; CHECK-NEXT: sldi r8, r8, 3 -; CHECK-NEXT: add r10, r4, r4 -; CHECK-NEXT: sldi r4, r30, 3 -; CHECK-NEXT: sldi r11, r11, 3 -; CHECK-NEXT: add r26, r12, r9 -; CHECK-NEXT: add r16, r18, r9 -; CHECK-NEXT: add r29, r12, r8 -; CHECK-NEXT: add r19, r18, r8 -; CHECK-NEXT: add r30, r12, r4 -; CHECK-NEXT: mr r20, r4 -; CHECK-NEXT: std r4, -200(r1) # 8-byte Folded Spill -; CHECK-NEXT: ld r4, -160(r1) # 8-byte Folded Reload -; CHECK-NEXT: add r15, r5, r11 -; CHECK-NEXT: sldi r11, r14, 3 -; CHECK-NEXT: add r29, r5, r29 -; CHECK-NEXT: add r28, r3, r26 -; CHECK-NEXT: add r19, r5, r19 -; CHECK-NEXT: add r21, r23, r9 -; CHECK-NEXT: add r24, r23, r8 -; CHECK-NEXT: add r14, r5, r11 -; CHECK-NEXT: sldi r11, r31, 3 -; CHECK-NEXT: add r25, r23, r20 -; CHECK-NEXT: add r20, r18, r20 -; CHECK-NEXT: add r30, r5, r30 -; CHECK-NEXT: add r18, r3, r16 -; CHECK-NEXT: add r24, r5, r24 -; CHECK-NEXT: add r23, r3, r21 -; CHECK-NEXT: add r27, r4, r26 -; CHECK-NEXT: add r22, r4, r21 -; CHECK-NEXT: add r17, r4, r16 -; CHECK-NEXT: add r2, r4, r11 -; CHECK-NEXT: rldicl r4, r7, 2, 1 -; CHECK-NEXT: sub r7, r8, r9 -; CHECK-NEXT: ld r8, -200(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r14, -168(r1) # 8-byte Folded Reload +; CHECK-NEXT: mulli r24, r30, 24 +; CHECK-NEXT: ld r16, -184(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r15, -176(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r3, -160(r1) # 8-byte Folded Reload +; CHECK-NEXT: rldicl r0, r7, 62, 2 +; CHECK-NEXT: sldi r11, r30, 
5 +; CHECK-NEXT: sldi r19, r30, 4 +; CHECK-NEXT: sldi r7, r14, 3 +; CHECK-NEXT: add r14, r30, r14 +; CHECK-NEXT: sldi r10, r16, 3 +; CHECK-NEXT: sldi r12, r15, 3 +; CHECK-NEXT: add r16, r30, r16 +; CHECK-NEXT: add r15, r30, r15 +; CHECK-NEXT: add r27, r11, r7 +; CHECK-NEXT: add r22, r24, r7 +; CHECK-NEXT: add r17, r19, r7 +; CHECK-NEXT: sldi r2, r14, 3 +; CHECK-NEXT: add r26, r24, r10 +; CHECK-NEXT: add r25, r24, r12 +; CHECK-NEXT: add r21, r19, r10 +; CHECK-NEXT: add r20, r19, r12 +; CHECK-NEXT: add r8, r11, r10 +; CHECK-NEXT: sldi r16, r16, 3 +; CHECK-NEXT: add r29, r5, r27 +; CHECK-NEXT: add r28, r4, r27 +; CHECK-NEXT: add r27, r3, r27 +; CHECK-NEXT: add r24, r5, r22 +; CHECK-NEXT: add r23, r4, r22 +; CHECK-NEXT: add r22, r3, r22 +; CHECK-NEXT: add r19, r5, r17 +; CHECK-NEXT: add r18, r4, r17 +; CHECK-NEXT: add r17, r3, r17 +; CHECK-NEXT: add r14, r5, r2 +; CHECK-NEXT: add r31, r4, r2 +; CHECK-NEXT: add r2, r3, r2 +; CHECK-NEXT: add r9, r5, r8 +; CHECK-NEXT: add r8, r11, r12 ; CHECK-NEXT: add r26, r5, r26 ; CHECK-NEXT: add r25, r5, r25 ; CHECK-NEXT: add r21, r5, r21 ; CHECK-NEXT: add r20, r5, r20 ; CHECK-NEXT: add r16, r5, r16 -; CHECK-NEXT: add r31, r5, r11 -; CHECK-NEXT: add r11, r3, r11 -; CHECK-NEXT: addi r4, r4, -4 -; CHECK-NEXT: rldicl r4, r4, 62, 2 -; CHECK-NEXT: sub r8, r8, r9 -; CHECK-NEXT: li r9, 0 -; CHECK-NEXT: addi r4, r4, 1 -; CHECK-NEXT: mtctr r4 +; CHECK-NEXT: add r8, r5, r8 +; CHECK-NEXT: rldicl r3, r0, 2, 1 +; CHECK-NEXT: addi r3, r3, -4 +; CHECK-NEXT: sub r0, r12, r7 +; CHECK-NEXT: sub r12, r10, r7 +; CHECK-NEXT: li r7, 0 +; CHECK-NEXT: mr r10, r30 +; CHECK-NEXT: sldi r15, r15, 3 +; CHECK-NEXT: add r15, r5, r15 +; CHECK-NEXT: rldicl r3, r3, 62, 2 +; CHECK-NEXT: addi r3, r3, 1 +; CHECK-NEXT: mtctr r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB7_3: # %for.body ; CHECK-NEXT: # -; CHECK-NEXT: lfd f0, 0(r11) -; CHECK-NEXT: lfd f1, 0(r2) -; CHECK-NEXT: add r0, r0, r10 -; CHECK-NEXT: xsmuldp f0, f0, f1 +; CHECK-NEXT: lfd f0, 0(r2) ; CHECK-NEXT: lfd 
f1, 0(r31) +; CHECK-NEXT: add r3, r10, r30 +; CHECK-NEXT: add r3, r3, r30 +; CHECK-NEXT: xsmuldp f0, f0, f1 +; CHECK-NEXT: lfd f1, 0(r14) +; CHECK-NEXT: add r3, r3, r30 +; CHECK-NEXT: add r10, r3, r30 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfd f0, 0(r31) -; CHECK-NEXT: add r31, r31, r12 -; CHECK-NEXT: lfdx f0, r11, r7 -; CHECK-NEXT: lfdx f1, r2, r7 +; CHECK-NEXT: stfd f0, 0(r14) +; CHECK-NEXT: add r14, r14, r11 +; CHECK-NEXT: lfdx f0, r2, r0 +; CHECK-NEXT: lfdx f1, r31, r0 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r14, r9 +; CHECK-NEXT: lfdx f1, r15, r7 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r14, r9 -; CHECK-NEXT: lfdx f0, r11, r8 -; CHECK-NEXT: lfdx f1, r2, r8 -; CHECK-NEXT: add r11, r11, r12 -; CHECK-NEXT: add r2, r2, r12 +; CHECK-NEXT: stfdx f0, r15, r7 +; CHECK-NEXT: lfdx f0, r2, r12 +; CHECK-NEXT: lfdx f1, r31, r12 +; CHECK-NEXT: add r2, r2, r11 +; CHECK-NEXT: add r31, r31, r11 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r15, r9 +; CHECK-NEXT: lfdx f1, r16, r7 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r15, r9 -; CHECK-NEXT: lfd f0, 0(r18) -; CHECK-NEXT: lfd f1, 0(r17) +; CHECK-NEXT: stfdx f0, r16, r7 +; CHECK-NEXT: lfd f0, 0(r17) +; CHECK-NEXT: lfd f1, 0(r18) ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r16, r9 +; CHECK-NEXT: lfdx f1, r19, r7 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r16, r9 -; CHECK-NEXT: lfdx f0, r18, r7 -; CHECK-NEXT: lfdx f1, r17, r7 +; CHECK-NEXT: stfdx f0, r19, r7 +; CHECK-NEXT: lfdx f0, r17, r0 +; CHECK-NEXT: lfdx f1, r18, r0 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r19, r9 +; CHECK-NEXT: lfdx f1, r20, r7 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r19, r9 -; CHECK-NEXT: lfdx f0, r18, r8 -; CHECK-NEXT: lfdx f1, r17, r8 -; CHECK-NEXT: add r18, r18, r12 -; CHECK-NEXT: add r17, r17, r12 +; CHECK-NEXT: stfdx f0, r20, r7 +; CHECK-NEXT: lfdx f0, r17, r12 +; CHECK-NEXT: lfdx f1, r18, r12 +; CHECK-NEXT: add r17, 
r17, r11 +; CHECK-NEXT: add r18, r18, r11 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r20, r9 +; CHECK-NEXT: lfdx f1, r21, r7 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r20, r9 -; CHECK-NEXT: lfd f0, 0(r23) -; CHECK-NEXT: lfd f1, 0(r22) +; CHECK-NEXT: stfdx f0, r21, r7 +; CHECK-NEXT: lfd f0, 0(r22) +; CHECK-NEXT: lfd f1, 0(r23) ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r21, r9 +; CHECK-NEXT: lfdx f1, r24, r7 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r21, r9 -; CHECK-NEXT: lfdx f0, r23, r7 -; CHECK-NEXT: lfdx f1, r22, r7 +; CHECK-NEXT: stfdx f0, r24, r7 +; CHECK-NEXT: lfdx f0, r22, r0 +; CHECK-NEXT: lfdx f1, r23, r0 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r24, r9 +; CHECK-NEXT: lfdx f1, r25, r7 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r24, r9 -; CHECK-NEXT: lfdx f0, r23, r8 -; CHECK-NEXT: lfdx f1, r22, r8 -; CHECK-NEXT: add r23, r23, r12 -; CHECK-NEXT: add r22, r22, r12 +; CHECK-NEXT: stfdx f0, r25, r7 +; CHECK-NEXT: lfdx f0, r22, r12 +; CHECK-NEXT: lfdx f1, r23, r12 +; CHECK-NEXT: add r22, r22, r11 +; CHECK-NEXT: add r23, r23, r11 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r25, r9 +; CHECK-NEXT: lfdx f1, r26, r7 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r25, r9 -; CHECK-NEXT: lfd f0, 0(r28) -; CHECK-NEXT: lfd f1, 0(r27) +; CHECK-NEXT: stfdx f0, r26, r7 +; CHECK-NEXT: lfd f0, 0(r27) +; CHECK-NEXT: lfd f1, 0(r28) ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r26, r9 +; CHECK-NEXT: lfdx f1, r29, r7 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r26, r9 -; CHECK-NEXT: lfdx f0, r28, r7 -; CHECK-NEXT: lfdx f1, r27, r7 +; CHECK-NEXT: stfdx f0, r29, r7 +; CHECK-NEXT: lfdx f0, r27, r0 +; CHECK-NEXT: lfdx f1, r28, r0 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r29, r9 +; CHECK-NEXT: lfdx f1, r8, r7 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r29, r9 -; CHECK-NEXT: lfdx f0, r28, r8 -; CHECK-NEXT: lfdx f1, 
r27, r8 -; CHECK-NEXT: add r28, r28, r12 -; CHECK-NEXT: add r27, r27, r12 +; CHECK-NEXT: stfdx f0, r8, r7 +; CHECK-NEXT: lfdx f0, r27, r12 +; CHECK-NEXT: lfdx f1, r28, r12 +; CHECK-NEXT: add r27, r27, r11 +; CHECK-NEXT: add r28, r28, r11 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r30, r9 +; CHECK-NEXT: lfdx f1, r9, r7 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r30, r9 -; CHECK-NEXT: add r9, r9, r12 +; CHECK-NEXT: stfdx f0, r9, r7 +; CHECK-NEXT: add r7, r7, r11 ; CHECK-NEXT: bdnz .LBB7_3 ; CHECK-NEXT: .LBB7_4: # %for.cond.cleanup.loopexit.unr-lcssa -; CHECK-NEXT: ld r7, -192(r1) # 8-byte Folded Reload ; CHECK-NEXT: cmpldi r6, 0 ; CHECK-NEXT: beq cr0, .LBB7_7 ; CHECK-NEXT: # %bb.5: # %for.body.epil.preheader -; CHECK-NEXT: ld r4, -184(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r29, -160(r1) # 8-byte Folded Reload -; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: sldi r7, r7, 3 -; CHECK-NEXT: add r4, r0, r4 -; CHECK-NEXT: sldi r4, r4, 3 -; CHECK-NEXT: add r3, r5, r4 -; CHECK-NEXT: add r8, r29, r4 -; CHECK-NEXT: add r9, r30, r4 -; CHECK-NEXT: ld r4, -176(r1) # 8-byte Folded Reload -; CHECK-NEXT: add r4, r0, r4 -; CHECK-NEXT: sldi r4, r4, 3 -; CHECK-NEXT: add r10, r5, r4 -; CHECK-NEXT: add r11, r29, r4 -; CHECK-NEXT: add r12, r30, r4 -; CHECK-NEXT: ld r4, -168(r1) # 8-byte Folded Reload -; CHECK-NEXT: add r4, r0, r4 -; CHECK-NEXT: sldi r0, r4, 3 -; CHECK-NEXT: add r5, r5, r0 -; CHECK-NEXT: add r4, r29, r0 -; CHECK-NEXT: add r30, r30, r0 -; CHECK-NEXT: li r0, 0 +; CHECK-NEXT: ld r3, -184(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r0, -160(r1) # 8-byte Folded Reload +; CHECK-NEXT: sldi r8, r30, 3 +; CHECK-NEXT: add r3, r10, r3 +; CHECK-NEXT: sldi r3, r3, 3 +; CHECK-NEXT: add r7, r5, r3 +; CHECK-NEXT: add r9, r4, r3 +; CHECK-NEXT: add r11, r0, r3 +; CHECK-NEXT: ld r3, -176(r1) # 8-byte Folded Reload +; CHECK-NEXT: add r3, r10, r3 +; CHECK-NEXT: sldi r3, r3, 3 +; CHECK-NEXT: add r12, r5, r3 +; CHECK-NEXT: add r30, r4, r3 +; CHECK-NEXT: add r29, r0, 
r3 +; CHECK-NEXT: ld r3, -168(r1) # 8-byte Folded Reload +; CHECK-NEXT: add r3, r10, r3 +; CHECK-NEXT: li r10, 0 +; CHECK-NEXT: sldi r3, r3, 3 +; CHECK-NEXT: add r5, r5, r3 +; CHECK-NEXT: add r4, r4, r3 +; CHECK-NEXT: add r3, r0, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB7_6: # %for.body.epil ; CHECK-NEXT: # -; CHECK-NEXT: lfdx f0, r30, r0 -; CHECK-NEXT: lfdx f1, r4, r0 +; CHECK-NEXT: lfdx f0, r3, r10 +; CHECK-NEXT: lfdx f1, r4, r10 ; CHECK-NEXT: addi r6, r6, -1 ; CHECK-NEXT: cmpldi r6, 0 ; CHECK-NEXT: xsmuldp f0, f0, f1 ; CHECK-NEXT: lfd f1, 0(r5) ; CHECK-NEXT: xsadddp f0, f1, f0 ; CHECK-NEXT: stfd f0, 0(r5) -; CHECK-NEXT: add r5, r5, r7 -; CHECK-NEXT: lfdx f0, r12, r0 -; CHECK-NEXT: lfdx f1, r11, r0 +; CHECK-NEXT: add r5, r5, r8 +; CHECK-NEXT: lfdx f0, r29, r10 +; CHECK-NEXT: lfdx f1, r30, r10 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r10, r0 +; CHECK-NEXT: lfdx f1, r12, r10 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r10, r0 -; CHECK-NEXT: lfdx f0, r9, r0 -; CHECK-NEXT: lfdx f1, r8, r0 +; CHECK-NEXT: stfdx f0, r12, r10 +; CHECK-NEXT: lfdx f0, r11, r10 +; CHECK-NEXT: lfdx f1, r9, r10 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r3, r0 +; CHECK-NEXT: lfdx f1, r7, r10 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r3, r0 -; CHECK-NEXT: add r0, r0, r7 +; CHECK-NEXT: stfdx f0, r7, r10 +; CHECK-NEXT: add r10, r10, r8 ; CHECK-NEXT: bne cr0, .LBB7_6 ; CHECK-NEXT: .LBB7_7: # %for.cond.cleanup ; CHECK-NEXT: ld r2, -152(r1) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll index fd2ba49ea861..9be03d557bd8 100644 --- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll @@ -26,7 +26,6 @@ ; CHECK-NEXT: Lower Garbage Collection Instructions ; CHECK-NEXT: Shadow Stack GC Lowering ; CHECK-NEXT: Remove unreachable blocks from the CFG -; CHECK-NEXT: Expand vector predication intrinsics ; CHECK-NEXT: Instrument function 
entry/exit with calls to e.g. mcount() (post inlining) ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics @@ -69,6 +68,7 @@ ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Stack Frame Layout Analysis +; CHECK-NEXT: RISC-V Indirect Branch Tracking ; CHECK-NEXT: RISC-V pseudo instruction expansion pass ; CHECK-NEXT: RISC-V atomic pseudo instruction expansion pass ; CHECK-NEXT: Unpack machine instruction bundles diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll index d6d0cca6ddae..7bad290bf313 100644 --- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll @@ -62,7 +62,6 @@ ; CHECK-NEXT: Constant Hoisting ; CHECK-NEXT: Replace intrinsics with calls to vector library ; CHECK-NEXT: Partially inline calls to library functions -; CHECK-NEXT: Expand vector predication intrinsics ; CHECK-NEXT: Instrument function entry/exit with calls to e.g. 
mcount() (post inlining) ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics @@ -195,6 +194,7 @@ ; CHECK-NEXT: Stack Frame Layout Analysis ; CHECK-NEXT: RISC-V Zcmp move merging pass ; CHECK-NEXT: RISC-V Zcmp Push/Pop optimization pass +; CHECK-NEXT: RISC-V Indirect Branch Tracking ; CHECK-NEXT: RISC-V pseudo instruction expansion pass ; CHECK-NEXT: RISC-V atomic pseudo instruction expansion pass ; CHECK-NEXT: Unpack machine instruction bundles diff --git a/llvm/test/CodeGen/RISCV/jumptable-swguarded.ll b/llvm/test/CodeGen/RISCV/jumptable-swguarded.ll index 9d57ca74cd78..0e87d8d6f82f 100644 --- a/llvm/test/CodeGen/RISCV/jumptable-swguarded.ll +++ b/llvm/test/CodeGen/RISCV/jumptable-swguarded.ll @@ -8,6 +8,7 @@ define void @above_threshold(i32 signext %in, ptr %out) nounwind { ; CHECK-LABEL: above_threshold: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lpad 0 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: li a2, 5 ; CHECK-NEXT: bltu a2, a0, .LBB0_9 diff --git a/llvm/test/CodeGen/RISCV/lpad.ll b/llvm/test/CodeGen/RISCV/lpad.ll new file mode 100644 index 000000000000..de82a9ee4e34 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/lpad.ll @@ -0,0 +1,101 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple riscv32 -mattr=+experimental-zicfilp < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple riscv64 -mattr=+experimental-zicfilp < %s | FileCheck %s --check-prefixes=CHECK,RV64 + +; Check indirectbr. 
+@__const.indirctbr.addr = private unnamed_addr constant [2 x ptr] [ptr blockaddress(@indirctbr, %labelA), ptr blockaddress(@indirctbr, %labelB)], align 8 +define void @indirctbr(i32 %i, ptr %p) { +; RV32-LABEL: indirctbr: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lpad 0 +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: lui a2, %hi(.L__const.indirctbr.addr) +; RV32-NEXT: addi a2, a2, %lo(.L__const.indirctbr.addr) +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: jr a0 +; RV32-NEXT: .p2align 2 +; RV32-NEXT: .Ltmp0: # Block address taken +; RV32-NEXT: .LBB0_1: # %labelA +; RV32-NEXT: lpad 0 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: sw a0, 0(a1) +; RV32-NEXT: .p2align 2 +; RV32-NEXT: .Ltmp1: # Block address taken +; RV32-NEXT: .LBB0_2: # %labelB +; RV32-NEXT: lpad 0 +; RV32-NEXT: li a0, 2 +; RV32-NEXT: sw a0, 0(a1) +; RV32-NEXT: ret +; +; RV64-LABEL: indirctbr: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lpad 0 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: lui a2, %hi(.L__const.indirctbr.addr) +; RV64-NEXT: addi a2, a2, %lo(.L__const.indirctbr.addr) +; RV64-NEXT: add a0, a2, a0 +; RV64-NEXT: ld a0, 0(a0) +; RV64-NEXT: jr a0 +; RV64-NEXT: .p2align 2 +; RV64-NEXT: .Ltmp0: # Block address taken +; RV64-NEXT: .LBB0_1: # %labelA +; RV64-NEXT: lpad 0 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: sw a0, 0(a1) +; RV64-NEXT: .p2align 2 +; RV64-NEXT: .Ltmp1: # Block address taken +; RV64-NEXT: .LBB0_2: # %labelB +; RV64-NEXT: lpad 0 +; RV64-NEXT: li a0, 2 +; RV64-NEXT: sw a0, 0(a1) +; RV64-NEXT: ret +entry: + %arrayidx = getelementptr inbounds [2 x ptr], ptr @__const.indirctbr.addr, i64 0, i32 %i + %0 = load ptr, ptr %arrayidx + indirectbr ptr %0, [label %labelA, label %labelB] + +labelA: ; preds = %entry + store volatile i32 1, ptr %p + br label %labelB + +labelB: ; preds = %labelA, %entry + store volatile i32 2, ptr %p + ret void +} + +; Check external linkage function. 
+define void @external() { +; CHECK-LABEL: external: +; CHECK: # %bb.0: +; CHECK-NEXT: lpad 0 +; CHECK-NEXT: ret + ret void +} + +; Check internal linkage function. +define internal void @internal() { +; CHECK-LABEL: internal: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + ret void +} + +; Check internal linkage function with taken address. +@foo = constant ptr @internal2 +define internal void @internal2() { +; CHECK-LABEL: internal2: +; CHECK: # %bb.0: +; CHECK-NEXT: lpad 0 +; CHECK-NEXT: ret + ret void +} + +; Check interrupt function does not need landing pad. +define void @interrupt() "interrupt"="user" { +; CHECK-LABEL: interrupt: +; CHECK: # %bb.0: +; CHECK-NEXT: mret + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vcompress.ll b/llvm/test/CodeGen/RISCV/rvv/vcompress.ll index 85663f08db6a..b763e116a9f6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vcompress.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vcompress.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfhmin,+zvfbfmin \ ; RUN: -verify-machineinstrs | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin,+zvfbfmin \ ; RUN: -verify-machineinstrs | FileCheck %s declare <vscale x 1 x i8> @llvm.riscv.vcompress.nxv1i8( @@ -817,3 +817,136 @@ entry: ret <vscale x 8 x double> %a } + +declare <vscale x 1 x bfloat> @llvm.riscv.vcompress.nxv1bf16( + <vscale x 1 x bfloat>, + <vscale x 1 x bfloat>, + <vscale x 1 x i1>, + iXLen); + +define <vscale x 1 x bfloat> @intrinsic_vcompress_vm_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vcompress_vm_nxv1bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma +; 
CHECK-NEXT: vcompress.vm v8, v9, v0 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x bfloat> @llvm.riscv.vcompress.nxv1bf16( + <vscale x 1 x bfloat> %0, + <vscale x 1 x bfloat> %1, + <vscale x 1 x i1> %2, + iXLen %3) + + ret <vscale x 1 x bfloat> %a +} + +declare <vscale x 2 x bfloat> @llvm.riscv.vcompress.nxv2bf16( + <vscale x 2 x bfloat>, + <vscale x 2 x bfloat>, + <vscale x 2 x i1>, + iXLen); + +define <vscale x 2 x bfloat> @intrinsic_vcompress_vm_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vcompress_vm_nxv2bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma +; CHECK-NEXT: vcompress.vm v8, v9, v0 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x bfloat> @llvm.riscv.vcompress.nxv2bf16( + <vscale x 2 x bfloat> %0, + <vscale x 2 x bfloat> %1, + <vscale x 2 x i1> %2, + iXLen %3) + + ret <vscale x 2 x bfloat> %a +} + +declare <vscale x 4 x bfloat> @llvm.riscv.vcompress.nxv4bf16( + <vscale x 4 x bfloat>, + <vscale x 4 x bfloat>, + <vscale x 4 x i1>, + iXLen); + +define <vscale x 4 x bfloat> @intrinsic_vcompress_vm_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vcompress_vm_nxv4bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma +; CHECK-NEXT: vcompress.vm v8, v9, v0 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x bfloat> @llvm.riscv.vcompress.nxv4bf16( + <vscale x 4 x bfloat> %0, + <vscale x 4 x bfloat> %1, + <vscale x 4 x i1> %2, + iXLen %3) + + ret <vscale x 4 x bfloat> %a +} + +declare <vscale x 8 x bfloat> @llvm.riscv.vcompress.nxv8bf16( + <vscale x 8 x bfloat>, + <vscale x 8 x bfloat>, + <vscale x 8 x i1>, + iXLen); + +define <vscale x 8 x bfloat> @intrinsic_vcompress_vm_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vcompress_vm_nxv8bf16: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma +; CHECK-NEXT: vcompress.vm v8, v10, v0 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x bfloat> @llvm.riscv.vcompress.nxv8bf16( + <vscale x 8 x bfloat> %0, + <vscale x 8 x bfloat> %1, + <vscale x 8 x i1> %2, + iXLen %3) + + ret <vscale x 8 x bfloat> %a +} + +declare <vscale x 16 x bfloat> @llvm.riscv.vcompress.nxv16bf16( + <vscale x 16 x bfloat>, + <vscale x 16 x bfloat>, + <vscale x 16 x i1>, + iXLen); + +define <vscale x 16 x bfloat> @intrinsic_vcompress_vm_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vcompress_vm_nxv16bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, ma +; CHECK-NEXT: vcompress.vm v8, v12, v0 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x bfloat> @llvm.riscv.vcompress.nxv16bf16( + <vscale x 16 x bfloat> %0, + <vscale x 16 x bfloat> %1, + <vscale x 16 x i1> %2, + iXLen %3) + + ret <vscale x 16 x bfloat> %a +} + +declare <vscale x 32 x bfloat> @llvm.riscv.vcompress.nxv32bf16( + <vscale x 32 x bfloat>, + <vscale x 32 x bfloat>, + <vscale x 32 x i1>, + iXLen); + +define <vscale x 32 x bfloat> @intrinsic_vcompress_vm_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vcompress_vm_nxv32bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m8, tu, ma +; CHECK-NEXT: vcompress.vm v8, v16, v0 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x bfloat> @llvm.riscv.vcompress.nxv32bf16( + <vscale x 32 x bfloat> %0, + <vscale x 32 x bfloat> %1, + <vscale x 32 x i1> %2, + iXLen %3) + + ret <vscale x 32 x bfloat> %a +} + diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-select.ll b/llvm/test/CodeGen/RISCV/rvv/vp-select.ll new file mode 100644 index 000000000000..c8a048971a80 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vp-select.ll @@ -0,0 +1,19 @@ +; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s + +define <vscale x 1 x i64> @all_ones(<vscale x 1 x i64> %true, <vscale x 1 x i64> %false, i32 %evl) { +; CHECK-LABEL: all_ones: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %v = call <vscale x 1 x i64> @llvm.vp.select.nxv1i64(<vscale x 1 x i1> splat (i1 true), <vscale x 1 x i64> %true, <vscale x 1 x i64> %false, i32 %evl) + ret <vscale x 1 x i64> %v +} + +define <vscale x 1 x i64> @all_zeroes(<vscale x 1 x i64> %true, <vscale x 1 x i64> %false, i32 %evl) { +; CHECK-LABEL: all_zeroes: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %v = call <vscale x 1 x i64> @llvm.vp.select.nxv1i64(<vscale x 1 x i1> splat (i1 false), <vscale x 1 x i64> %true, <vscale x 1 x i64> %false, i32 %evl) + ret <vscale x 1 x i64> %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vrgather.ll b/llvm/test/CodeGen/RISCV/rvv/vrgather.ll index d11e172b2503..5d700e683a96 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vrgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vrgather.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zfhmin,+zvfh,+zvfbfmin \ ; RUN: -verify-machineinstrs | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zfhmin,+zvfh,+zvfbfmin \ ; RUN: -verify-machineinstrs | FileCheck %s declare <vscale x 1 x i8> @llvm.riscv.vrgather.vv.nxv1i8.iXLen( @@ -4820,3 +4820,785 @@ entry: ret <vscale x 8 x double> %a } + +declare <vscale x 1 x bfloat> @llvm.riscv.vrgather.vv.nxv1bf16.iXLen( + <vscale x 1 x bfloat>, + <vscale x 1 x bfloat>, + <vscale x 1 x i16>, + iXLen) + +define <vscale x 1 x bfloat> 
@intrinsic_vrgather_vv_nxv1bf16_nxv1bf16_nxv1i16(<vscale x 1 x bfloat> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vrgather_vv_nxv1bf16_nxv1bf16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vrgather.vv v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x bfloat> @llvm.riscv.vrgather.vv.nxv1bf16.iXLen( + <vscale x 1 x bfloat> undef, + <vscale x 1 x bfloat> %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x bfloat> %a +} + +declare <vscale x 1 x bfloat> @llvm.riscv.vrgather.vv.mask.nxv1bf16.iXLen( + <vscale x 1 x bfloat>, + <vscale x 1 x bfloat>, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen) + +define <vscale x 1 x bfloat> @intrinsic_vrgather_mask_vv_nxv1bf16_nxv1bf16_nxv1i16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1bf16_nxv1bf16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x bfloat> @llvm.riscv.vrgather.vv.mask.nxv1bf16.iXLen( + <vscale x 1 x bfloat> %0, + <vscale x 1 x bfloat> %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x bfloat> %a +} + +declare <vscale x 2 x bfloat> @llvm.riscv.vrgather.vv.nxv2bf16.iXLen( + <vscale x 2 x bfloat>, + <vscale x 2 x bfloat>, + <vscale x 2 x i16>, + iXLen) + +define <vscale x 2 x bfloat> @intrinsic_vrgather_vv_nxv2bf16_nxv2bf16_nxv2i16(<vscale x 2 x bfloat> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vrgather_vv_nxv2bf16_nxv2bf16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vrgather.vv v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x bfloat> 
@llvm.riscv.vrgather.vv.nxv2bf16.iXLen( + <vscale x 2 x bfloat> undef, + <vscale x 2 x bfloat> %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x bfloat> %a +} + +declare <vscale x 2 x bfloat> @llvm.riscv.vrgather.vv.mask.nxv2bf16.iXLen( + <vscale x 2 x bfloat>, + <vscale x 2 x bfloat>, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen) + +define <vscale x 2 x bfloat> @intrinsic_vrgather_mask_vv_nxv2bf16_nxv2bf16_nxv2i16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2bf16_nxv2bf16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x bfloat> @llvm.riscv.vrgather.vv.mask.nxv2bf16.iXLen( + <vscale x 2 x bfloat> %0, + <vscale x 2 x bfloat> %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x bfloat> %a +} + +declare <vscale x 4 x bfloat> @llvm.riscv.vrgather.vv.nxv4bf16.iXLen( + <vscale x 4 x bfloat>, + <vscale x 4 x bfloat>, + <vscale x 4 x i16>, + iXLen) + +define <vscale x 4 x bfloat> @intrinsic_vrgather_vv_nxv4bf16_nxv4bf16_nxv4i16(<vscale x 4 x bfloat> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vrgather_vv_nxv4bf16_nxv4bf16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vrgather.vv v10, v8, v9 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x bfloat> @llvm.riscv.vrgather.vv.nxv4bf16.iXLen( + <vscale x 4 x bfloat> undef, + <vscale x 4 x bfloat> %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x bfloat> %a +} + +declare <vscale x 4 x bfloat> @llvm.riscv.vrgather.vv.mask.nxv4bf16.iXLen( + <vscale x 4 x bfloat>, + <vscale x 4 x bfloat>, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen) + +define <vscale x 4 x bfloat> 
@intrinsic_vrgather_mask_vv_nxv4bf16_nxv4bf16_nxv4i16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4bf16_nxv4bf16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x bfloat> @llvm.riscv.vrgather.vv.mask.nxv4bf16.iXLen( + <vscale x 4 x bfloat> %0, + <vscale x 4 x bfloat> %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x bfloat> %a +} + +declare <vscale x 8 x bfloat> @llvm.riscv.vrgather.vv.nxv8bf16.iXLen( + <vscale x 8 x bfloat>, + <vscale x 8 x bfloat>, + <vscale x 8 x i16>, + iXLen) + +define <vscale x 8 x bfloat> @intrinsic_vrgather_vv_nxv8bf16_nxv8bf16_nxv8i16(<vscale x 8 x bfloat> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vrgather_vv_nxv8bf16_nxv8bf16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vrgather.vv v12, v8, v10 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x bfloat> @llvm.riscv.vrgather.vv.nxv8bf16.iXLen( + <vscale x 8 x bfloat> undef, + <vscale x 8 x bfloat> %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x bfloat> %a +} + +declare <vscale x 8 x bfloat> @llvm.riscv.vrgather.vv.mask.nxv8bf16.iXLen( + <vscale x 8 x bfloat>, + <vscale x 8 x bfloat>, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen) + +define <vscale x 8 x bfloat> @intrinsic_vrgather_mask_vv_nxv8bf16_nxv8bf16_nxv8i16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8bf16_nxv8bf16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu +; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t +; CHECK-NEXT: ret +entry: + 
%a = call <vscale x 8 x bfloat> @llvm.riscv.vrgather.vv.mask.nxv8bf16.iXLen( + <vscale x 8 x bfloat> %0, + <vscale x 8 x bfloat> %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x bfloat> %a +} + +declare <vscale x 16 x bfloat> @llvm.riscv.vrgather.vv.nxv16bf16.iXLen( + <vscale x 16 x bfloat>, + <vscale x 16 x bfloat>, + <vscale x 16 x i16>, + iXLen) + +define <vscale x 16 x bfloat> @intrinsic_vrgather_vv_nxv16bf16_nxv16bf16_nxv16i16(<vscale x 16 x bfloat> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vrgather_vv_nxv16bf16_nxv16bf16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vrgather.vv v16, v8, v12 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x bfloat> @llvm.riscv.vrgather.vv.nxv16bf16.iXLen( + <vscale x 16 x bfloat> undef, + <vscale x 16 x bfloat> %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x bfloat> %a +} + +declare <vscale x 16 x bfloat> @llvm.riscv.vrgather.vv.mask.nxv16bf16.iXLen( + <vscale x 16 x bfloat>, + <vscale x 16 x bfloat>, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen) + +define <vscale x 16 x bfloat> @intrinsic_vrgather_mask_vv_nxv16bf16_nxv16bf16_nxv16i16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16bf16_nxv16bf16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu +; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x bfloat> @llvm.riscv.vrgather.vv.mask.nxv16bf16.iXLen( + <vscale x 16 x bfloat> %0, + <vscale x 16 x bfloat> %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x bfloat> %a +} + +declare <vscale x 32 x bfloat> @llvm.riscv.vrgather.vv.nxv32bf16.iXLen( + <vscale x 32 x bfloat>, + <vscale 
x 32 x bfloat>, + <vscale x 32 x i16>, + iXLen) + +define <vscale x 32 x bfloat> @intrinsic_vrgather_vv_nxv32bf16_nxv32bf16_nxv32i16(<vscale x 32 x bfloat> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vrgather_vv_nxv32bf16_nxv32bf16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vrgather.vv v24, v8, v16 +; CHECK-NEXT: vmv.v.v v8, v24 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x bfloat> @llvm.riscv.vrgather.vv.nxv32bf16.iXLen( + <vscale x 32 x bfloat> undef, + <vscale x 32 x bfloat> %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x bfloat> %a +} + +declare <vscale x 32 x bfloat> @llvm.riscv.vrgather.vv.mask.nxv32bf16.iXLen( + <vscale x 32 x bfloat>, + <vscale x 32 x bfloat>, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen) + +define <vscale x 32 x bfloat> @intrinsic_vrgather_mask_vv_nxv32bf16_nxv32bf16_nxv32i16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv32bf16_nxv32bf16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl8re16.v v24, (a0) +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x bfloat> @llvm.riscv.vrgather.vv.mask.nxv32bf16.iXLen( + <vscale x 32 x bfloat> %0, + <vscale x 32 x bfloat> %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x bfloat> %a +} + +declare <vscale x 1 x bfloat> @llvm.riscv.vrgather.vx.nxv1bf16.iXLen( + <vscale x 1 x bfloat>, + <vscale x 1 x bfloat>, + iXLen, + iXLen) + +define <vscale x 1 x bfloat> @intrinsic_vrgather_vx_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, iXLen %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vrgather_vx_nxv1bf16_nxv1bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: 
vrgather.vx v9, v8, a0 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x bfloat> @llvm.riscv.vrgather.vx.nxv1bf16.iXLen( + <vscale x 1 x bfloat> undef, + <vscale x 1 x bfloat> %0, + iXLen %1, + iXLen %2) + + ret <vscale x 1 x bfloat> %a +} + +declare <vscale x 1 x bfloat> @llvm.riscv.vrgather.vx.mask.nxv1bf16.iXLen( + <vscale x 1 x bfloat>, + <vscale x 1 x bfloat>, + iXLen, + <vscale x 1 x i1>, + iXLen, + iXLen) + +define <vscale x 1 x bfloat> @intrinsic_vrgather_mask_vx_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1bf16_nxv1bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x bfloat> @llvm.riscv.vrgather.vx.mask.nxv1bf16.iXLen( + <vscale x 1 x bfloat> %0, + <vscale x 1 x bfloat> %1, + iXLen %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x bfloat> %a +} + +declare <vscale x 2 x bfloat> @llvm.riscv.vrgather.vx.nxv2bf16.iXLen( + <vscale x 2 x bfloat>, + <vscale x 2 x bfloat>, + iXLen, + iXLen) + +define <vscale x 2 x bfloat> @intrinsic_vrgather_vx_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, iXLen %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vrgather_vx_nxv2bf16_nxv2bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vrgather.vx v9, v8, a0 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x bfloat> @llvm.riscv.vrgather.vx.nxv2bf16.iXLen( + <vscale x 2 x bfloat> undef, + <vscale x 2 x bfloat> %0, + iXLen %1, + iXLen %2) + + ret <vscale x 2 x bfloat> %a +} + +declare <vscale x 2 x bfloat> @llvm.riscv.vrgather.vx.mask.nxv2bf16.iXLen( + <vscale x 2 x bfloat>, + <vscale x 2 x bfloat>, + iXLen, + <vscale x 2 x i1>, + iXLen, + iXLen) + +define <vscale x 2 x bfloat> 
@intrinsic_vrgather_mask_vx_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2bf16_nxv2bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x bfloat> @llvm.riscv.vrgather.vx.mask.nxv2bf16.iXLen( + <vscale x 2 x bfloat> %0, + <vscale x 2 x bfloat> %1, + iXLen %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x bfloat> %a +} + +declare <vscale x 4 x bfloat> @llvm.riscv.vrgather.vx.nxv4bf16.iXLen( + <vscale x 4 x bfloat>, + <vscale x 4 x bfloat>, + iXLen, + iXLen) + +define <vscale x 4 x bfloat> @intrinsic_vrgather_vx_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, iXLen %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vrgather_vx_nxv4bf16_nxv4bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vrgather.vx v9, v8, a0 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x bfloat> @llvm.riscv.vrgather.vx.nxv4bf16.iXLen( + <vscale x 4 x bfloat> undef, + <vscale x 4 x bfloat> %0, + iXLen %1, + iXLen %2) + + ret <vscale x 4 x bfloat> %a +} + +declare <vscale x 4 x bfloat> @llvm.riscv.vrgather.vx.mask.nxv4bf16.iXLen( + <vscale x 4 x bfloat>, + <vscale x 4 x bfloat>, + iXLen, + <vscale x 4 x i1>, + iXLen, + iXLen) + +define <vscale x 4 x bfloat> @intrinsic_vrgather_mask_vx_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4bf16_nxv4bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x bfloat> @llvm.riscv.vrgather.vx.mask.nxv4bf16.iXLen( + <vscale x 4 x bfloat> %0, + <vscale x 4 x bfloat> %1, + iXLen %2, 
+ <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x bfloat> %a +} + +declare <vscale x 8 x bfloat> @llvm.riscv.vrgather.vx.nxv8bf16.iXLen( + <vscale x 8 x bfloat>, + <vscale x 8 x bfloat>, + iXLen, + iXLen) + +define <vscale x 8 x bfloat> @intrinsic_vrgather_vx_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, iXLen %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vrgather_vx_nxv8bf16_nxv8bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vrgather.vx v10, v8, a0 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x bfloat> @llvm.riscv.vrgather.vx.nxv8bf16.iXLen( + <vscale x 8 x bfloat> undef, + <vscale x 8 x bfloat> %0, + iXLen %1, + iXLen %2) + + ret <vscale x 8 x bfloat> %a +} + +declare <vscale x 8 x bfloat> @llvm.riscv.vrgather.vx.mask.nxv8bf16.iXLen( + <vscale x 8 x bfloat>, + <vscale x 8 x bfloat>, + iXLen, + <vscale x 8 x i1>, + iXLen, + iXLen) + +define <vscale x 8 x bfloat> @intrinsic_vrgather_mask_vx_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8bf16_nxv8bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vrgather.vx v8, v10, a0, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x bfloat> @llvm.riscv.vrgather.vx.mask.nxv8bf16.iXLen( + <vscale x 8 x bfloat> %0, + <vscale x 8 x bfloat> %1, + iXLen %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x bfloat> %a +} + +declare <vscale x 16 x bfloat> @llvm.riscv.vrgather.vx.nxv16bf16.iXLen( + <vscale x 16 x bfloat>, + <vscale x 16 x bfloat>, + iXLen, + iXLen) + +define <vscale x 16 x bfloat> @intrinsic_vrgather_vx_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, iXLen %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vrgather_vx_nxv16bf16_nxv16bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; 
CHECK-NEXT: vrgather.vx v12, v8, a0 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x bfloat> @llvm.riscv.vrgather.vx.nxv16bf16.iXLen( + <vscale x 16 x bfloat> undef, + <vscale x 16 x bfloat> %0, + iXLen %1, + iXLen %2) + + ret <vscale x 16 x bfloat> %a +} + +declare <vscale x 16 x bfloat> @llvm.riscv.vrgather.vx.mask.nxv16bf16.iXLen( + <vscale x 16 x bfloat>, + <vscale x 16 x bfloat>, + iXLen, + <vscale x 16 x i1>, + iXLen, + iXLen) + +define <vscale x 16 x bfloat> @intrinsic_vrgather_mask_vx_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16bf16_nxv16bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vrgather.vx v8, v12, a0, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x bfloat> @llvm.riscv.vrgather.vx.mask.nxv16bf16.iXLen( + <vscale x 16 x bfloat> %0, + <vscale x 16 x bfloat> %1, + iXLen %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x bfloat> %a +} + +declare <vscale x 32 x bfloat> @llvm.riscv.vrgather.vx.nxv32bf16.iXLen( + <vscale x 32 x bfloat>, + <vscale x 32 x bfloat>, + iXLen, + iXLen) + +define <vscale x 32 x bfloat> @intrinsic_vrgather_vx_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, iXLen %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vrgather_vx_nxv32bf16_nxv32bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vrgather.vx v16, v8, a0 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x bfloat> @llvm.riscv.vrgather.vx.nxv32bf16.iXLen( + <vscale x 32 x bfloat> undef, + <vscale x 32 x bfloat> %0, + iXLen %1, + iXLen %2) + + ret <vscale x 32 x bfloat> %a +} + +declare <vscale x 32 x bfloat> @llvm.riscv.vrgather.vx.mask.nxv32bf16.iXLen( + <vscale x 32 x bfloat>, + <vscale x 32 x bfloat>, + iXLen, + <vscale x 32 x i1>, + iXLen, + iXLen) 
+ +define <vscale x 32 x bfloat> @intrinsic_vrgather_mask_vx_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32bf16_nxv32bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vrgather.vx v8, v16, a0, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x bfloat> @llvm.riscv.vrgather.vx.mask.nxv32bf16.iXLen( + <vscale x 32 x bfloat> %0, + <vscale x 32 x bfloat> %1, + iXLen %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x bfloat> %a +} + +define <vscale x 1 x bfloat> @intrinsic_vrgather_vi_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, iXLen %1) nounwind { +; CHECK-LABEL: intrinsic_vrgather_vi_nxv1bf16_nxv1bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vrgather.vi v9, v8, 9 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x bfloat> @llvm.riscv.vrgather.vx.nxv1bf16.iXLen( + <vscale x 1 x bfloat> undef, + <vscale x 1 x bfloat> %0, + iXLen 9, + iXLen %1) + + ret <vscale x 1 x bfloat> %a +} + +define <vscale x 1 x bfloat> @intrinsic_vrgather_mask_vi_nxv1bf16_nxv1bf16(<vscale x 1 x bfloat> %0, <vscale x 1 x bfloat> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1bf16_nxv1bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu +; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x bfloat> @llvm.riscv.vrgather.vx.mask.nxv1bf16.iXLen( + <vscale x 1 x bfloat> %0, + <vscale x 1 x bfloat> %1, + iXLen 9, + <vscale x 1 x i1> %2, + iXLen %3, iXLen 1) + + ret <vscale x 1 x bfloat> %a +} + +define <vscale x 2 x bfloat> @intrinsic_vrgather_vi_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, iXLen %1) nounwind { +; CHECK-LABEL: intrinsic_vrgather_vi_nxv2bf16_nxv2bf16: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vrgather.vi v9, v8, 9 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x bfloat> @llvm.riscv.vrgather.vx.nxv2bf16.iXLen( + <vscale x 2 x bfloat> undef, + <vscale x 2 x bfloat> %0, + iXLen 9, + iXLen %1) + + ret <vscale x 2 x bfloat> %a +} + +define <vscale x 2 x bfloat> @intrinsic_vrgather_mask_vi_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %0, <vscale x 2 x bfloat> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2bf16_nxv2bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu +; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x bfloat> @llvm.riscv.vrgather.vx.mask.nxv2bf16.iXLen( + <vscale x 2 x bfloat> %0, + <vscale x 2 x bfloat> %1, + iXLen 9, + <vscale x 2 x i1> %2, + iXLen %3, iXLen 1) + + ret <vscale x 2 x bfloat> %a +} + +define <vscale x 4 x bfloat> @intrinsic_vrgather_vi_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, iXLen %1) nounwind { +; CHECK-LABEL: intrinsic_vrgather_vi_nxv4bf16_nxv4bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vrgather.vi v9, v8, 9 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x bfloat> @llvm.riscv.vrgather.vx.nxv4bf16.iXLen( + <vscale x 4 x bfloat> undef, + <vscale x 4 x bfloat> %0, + iXLen 9, + iXLen %1) + + ret <vscale x 4 x bfloat> %a +} + +define <vscale x 4 x bfloat> @intrinsic_vrgather_mask_vi_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %0, <vscale x 4 x bfloat> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4bf16_nxv4bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu +; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x bfloat> @llvm.riscv.vrgather.vx.mask.nxv4bf16.iXLen( + <vscale x 4 x bfloat> %0, + <vscale x 4 x bfloat> %1, 
+ iXLen 9, + <vscale x 4 x i1> %2, + iXLen %3, iXLen 1) + + ret <vscale x 4 x bfloat> %a +} + +define <vscale x 8 x bfloat> @intrinsic_vrgather_vi_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, iXLen %1) nounwind { +; CHECK-LABEL: intrinsic_vrgather_vi_nxv8bf16_nxv8bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vrgather.vi v10, v8, 9 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x bfloat> @llvm.riscv.vrgather.vx.nxv8bf16.iXLen( + <vscale x 8 x bfloat> undef, + <vscale x 8 x bfloat> %0, + iXLen 9, + iXLen %1) + + ret <vscale x 8 x bfloat> %a +} + +define <vscale x 8 x bfloat> @intrinsic_vrgather_mask_vi_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %0, <vscale x 8 x bfloat> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8bf16_nxv8bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu +; CHECK-NEXT: vrgather.vi v8, v10, 9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x bfloat> @llvm.riscv.vrgather.vx.mask.nxv8bf16.iXLen( + <vscale x 8 x bfloat> %0, + <vscale x 8 x bfloat> %1, + iXLen 9, + <vscale x 8 x i1> %2, + iXLen %3, iXLen 1) + + ret <vscale x 8 x bfloat> %a +} + +define <vscale x 16 x bfloat> @intrinsic_vrgather_vi_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, iXLen %1) nounwind { +; CHECK-LABEL: intrinsic_vrgather_vi_nxv16bf16_nxv16bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vrgather.vi v12, v8, 9 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x bfloat> @llvm.riscv.vrgather.vx.nxv16bf16.iXLen( + <vscale x 16 x bfloat> undef, + <vscale x 16 x bfloat> %0, + iXLen 9, + iXLen %1) + + ret <vscale x 16 x bfloat> %a +} + +define <vscale x 16 x bfloat> @intrinsic_vrgather_mask_vi_nxv16bf16_nxv16bf16(<vscale x 16 x bfloat> %0, <vscale x 16 x bfloat> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: 
intrinsic_vrgather_mask_vi_nxv16bf16_nxv16bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu +; CHECK-NEXT: vrgather.vi v8, v12, 9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x bfloat> @llvm.riscv.vrgather.vx.mask.nxv16bf16.iXLen( + <vscale x 16 x bfloat> %0, + <vscale x 16 x bfloat> %1, + iXLen 9, + <vscale x 16 x i1> %2, + iXLen %3, iXLen 1) + + ret <vscale x 16 x bfloat> %a +} + +define <vscale x 32 x bfloat> @intrinsic_vrgather_vi_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, iXLen %1) nounwind { +; CHECK-LABEL: intrinsic_vrgather_vi_nxv32bf16_nxv32bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vrgather.vi v16, v8, 9 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x bfloat> @llvm.riscv.vrgather.vx.nxv32bf16.iXLen( + <vscale x 32 x bfloat> undef, + <vscale x 32 x bfloat> %0, + iXLen 9, + iXLen %1) + + ret <vscale x 32 x bfloat> %a +} + +define <vscale x 32 x bfloat> @intrinsic_vrgather_mask_vi_nxv32bf16_nxv32bf16(<vscale x 32 x bfloat> %0, <vscale x 32 x bfloat> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32bf16_nxv32bf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu +; CHECK-NEXT: vrgather.vi v8, v16, 9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x bfloat> @llvm.riscv.vrgather.vx.mask.nxv32bf16.iXLen( + <vscale x 32 x bfloat> %0, + <vscale x 32 x bfloat> %1, + iXLen 9, + <vscale x 32 x i1> %2, + iXLen %3, iXLen 1) + + ret <vscale x 32 x bfloat> %a +} + diff --git a/llvm/test/CodeGen/RISCV/selectcc-to-shiftand.ll b/llvm/test/CodeGen/RISCV/selectcc-to-shiftand.ll index 4749cc656693..0d96fbfa8127 100644 --- a/llvm/test/CodeGen/RISCV/selectcc-to-shiftand.ll +++ b/llvm/test/CodeGen/RISCV/selectcc-to-shiftand.ll @@ -276,9 +276,8 @@ define i64 @sraiw_andi(i32 signext %0, i32 signext %1) nounwind { ; RV64-LABEL: sraiw_andi: ; RV64: # %bb.0: # 
%entry ; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: srai a0, a0, 2 -; RV64-NEXT: srli a0, a0, 61 +; RV64-NEXT: sraiw a0, a0, 31 +; RV64-NEXT: andi a0, a0, 7 ; RV64-NEXT: ret entry: %3 = add i32 %0, %1 diff --git a/llvm/test/CodeGen/WebAssembly/offset.ll b/llvm/test/CodeGen/WebAssembly/offset.ll index 65de341780e3..763c60cef818 100644 --- a/llvm/test/CodeGen/WebAssembly/offset.ll +++ b/llvm/test/CodeGen/WebAssembly/offset.ll @@ -40,6 +40,26 @@ define i32 @load_i32_with_folded_gep_offset(ptr %p) { ret i32 %t } +; Same for nusw. + +; CHECK-LABEL: load_i32_with_folded_gep_offset_nusw: +; CHECK: i32.load $push0=, 24($0){{$}} +define i32 @load_i32_with_folded_gep_offset_nusw(ptr %p) { + %s = getelementptr nusw i32, ptr %p, i32 6 + %t = load i32, ptr %s + ret i32 %t +} + +; For nuw we don't need the offset to be positive. + +; CHECK-LABEL: load_i32_with_folded_gep_offset_nuw: +; CHECK: i32.load $push0=, -24($0){{$}} +define i32 @load_i32_with_folded_gep_offset_nuw(ptr %p) { + %s = getelementptr nuw i32, ptr %p, i32 -6 + %t = load i32, ptr %s + ret i32 %t +} + ; We can't fold a negative offset though, even with an inbounds gep. ; CHECK-LABEL: load_i32_with_unfolded_gep_negative_offset: diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll index 29d3c2795ffb..98b86384b844 100644 --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -26,7 +26,6 @@ ; CHECK-NEXT: Lower Garbage Collection Instructions ; CHECK-NEXT: Shadow Stack GC Lowering ; CHECK-NEXT: Remove unreachable blocks from the CFG -; CHECK-NEXT: Expand vector predication intrinsics ; CHECK-NEXT: Instrument function entry/exit with calls to e.g. 
mcount() (post inlining) ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics diff --git a/llvm/test/CodeGen/X86/apx/and.ll b/llvm/test/CodeGen/X86/apx/and.ll index 51858ad59160..23aed77b948b 100644 --- a/llvm/test/CodeGen/X86/apx/and.ll +++ b/llvm/test/CodeGen/X86/apx/and.ll @@ -482,17 +482,17 @@ define i1 @andflag16rr(i16 %a, i16 %b) { define i1 @andflag32rr(i32 %a, i32 %b) { ; CHECK-LABEL: andflag32rr: ; CHECK: # %bb.0: -; CHECK-NEXT: andl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x21,0xf7] +; CHECK-NEXT: andl %edi, %esi # EVEX TO LEGACY Compression encoding: [0x21,0xfe] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: andflag32rr: ; NF: # %bb.0: -; NF-NEXT: andl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x21,0xf7] +; NF-NEXT: andl %edi, %esi # EVEX TO LEGACY Compression encoding: [0x21,0xfe] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A] ; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %v0 = and i32 %a, %b ; 0xff << 50 @@ -504,17 +504,17 @@ define i1 @andflag32rr(i32 %a, i32 %b) { define i1 @andflag64rr(i64 %a, i64 %b) { ; CHECK-LABEL: andflag64rr: ; CHECK: # %bb.0: -; CHECK-NEXT: andq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x21,0xf7] +; CHECK-NEXT: andq %rdi, %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x21,0xfe] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 
3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: andflag64rr: ; NF: # %bb.0: -; NF-NEXT: andq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x21,0xf7] +; NF-NEXT: andq %rdi, %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x21,0xfe] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A] ; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %v0 = and i64 %a, %b ; 0xff << 50 @@ -578,17 +578,17 @@ define i1 @andflag16rm(ptr %ptr, i16 %b) { define i1 @andflag32rm(ptr %ptr, i32 %b) { ; CHECK-LABEL: andflag32rm: ; CHECK: # %bb.0: -; CHECK-NEXT: andl (%rdi), %esi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x23,0x37] +; CHECK-NEXT: andl (%rdi), %esi # EVEX TO LEGACY Compression encoding: [0x23,0x37] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: andflag32rm: ; NF: # %bb.0: -; NF-NEXT: andl (%rdi), %esi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x23,0x37] +; NF-NEXT: andl (%rdi), %esi # EVEX TO LEGACY Compression encoding: [0x23,0x37] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A] ; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %a = load i32, ptr %ptr @@ -601,17 +601,17 @@ define i1 @andflag32rm(ptr %ptr, i32 %b) { define i1 @andflag64rm(ptr %ptr, i64 %b) { ; CHECK-LABEL: andflag64rm: ; CHECK: # %bb.0: -; CHECK-NEXT: andq (%rdi), %rsi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x23,0x37] +; 
CHECK-NEXT: andq (%rdi), %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x23,0x37] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: andflag64rm: ; NF: # %bb.0: -; NF-NEXT: andq (%rdi), %rsi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x23,0x37] +; NF-NEXT: andq (%rdi), %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x23,0x37] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A] ; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %a = load i64, ptr %ptr @@ -672,19 +672,19 @@ define i1 @andflag16ri(i16 %a) { define i1 @andflag32ri(i32 %a) { ; CHECK-LABEL: andflag32ri: ; CHECK: # %bb.0: -; CHECK-NEXT: andl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xe7,0x40,0xe2,0x01,0x00] +; CHECK-NEXT: andl $123456, %edi # EVEX TO LEGACY Compression encoding: [0x81,0xe7,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movl %edi, d64(%rip) # encoding: [0x89,0x3d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: andflag32ri: ; NF: # %bb.0: -; NF-NEXT: andl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xe7,0x40,0xe2,0x01,0x00] +; NF-NEXT: andl $123456, %edi # EVEX TO LEGACY Compression encoding: [0x81,0xe7,0x40,0xe2,0x01,0x00] ; NF-NEXT: # imm = 0x1E240 ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: 
movl %edi, d64(%rip) # encoding: [0x89,0x3d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %v0 = and i32 %a, 123456 ; 0xff << 50 @@ -696,19 +696,19 @@ define i1 @andflag32ri(i32 %a) { define i1 @andflag64ri(i64 %a) { ; CHECK-LABEL: andflag64ri: ; CHECK: # %bb.0: -; CHECK-NEXT: andq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xe7,0x40,0xe2,0x01,0x00] +; CHECK-NEXT: andq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xe7,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: andflag64ri: ; NF: # %bb.0: -; NF-NEXT: andq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xe7,0x40,0xe2,0x01,0x00] +; NF-NEXT: andq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xe7,0x40,0xe2,0x01,0x00] ; NF-NEXT: # imm = 0x1E240 ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %v0 = and i64 %a, 123456 ; 0xff << 50 @@ -743,17 +743,17 @@ define i1 @andflag16ri8(i16 %a) { define i1 @andflag32ri8(i32 %a) { ; CHECK-LABEL: andflag32ri8: ; CHECK: # %bb.0: -; CHECK-NEXT: andl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xe7,0x7b] +; CHECK-NEXT: andl $123, %edi # EVEX TO LEGACY Compression encoding: [0x83,0xe7,0x7b] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movl %edi, d64(%rip) # encoding: [0x89,0x3d,A,A,A,A] 
; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: andflag32ri8: ; NF: # %bb.0: -; NF-NEXT: andl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xe7,0x7b] +; NF-NEXT: andl $123, %edi # EVEX TO LEGACY Compression encoding: [0x83,0xe7,0x7b] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: movl %edi, d64(%rip) # encoding: [0x89,0x3d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %v0 = and i32 %a, 123 ; 0xff << 50 @@ -765,17 +765,17 @@ define i1 @andflag32ri8(i32 %a) { define i1 @andflag64ri8(i64 %a) { ; CHECK-LABEL: andflag64ri8: ; CHECK: # %bb.0: -; CHECK-NEXT: andq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xe7,0x7b] +; CHECK-NEXT: andq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xe7,0x7b] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: andflag64ri8: ; NF: # %bb.0: -; NF-NEXT: andq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xe7,0x7b] +; NF-NEXT: andq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xe7,0x7b] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %v0 = and i64 %a, 123 ; 0xff << 50 diff --git a/llvm/test/CodeGen/X86/apx/cmov.ll b/llvm/test/CodeGen/X86/apx/cmov.ll index 7a6a63f813c0..7b846120d3f7 100644 --- a/llvm/test/CodeGen/X86/apx/cmov.ll +++ 
b/llvm/test/CodeGen/X86/apx/cmov.ll @@ -5,10 +5,10 @@ define i8 @cmov8(i8 %a, i8 %b, i8 %x, ptr %y.ptr) { ; CHECK-LABEL: cmov8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cmpb %sil, %dil # encoding: [0x40,0x38,0xf7] -; CHECK-NEXT: cmoval %edi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x47,0xd7] -; CHECK-NEXT: movzbl (%rcx), %ecx # encoding: [0x0f,0xb6,0x09] -; CHECK-NEXT: cmovbel %edx, %ecx # EVEX TO LEGACY Compression encoding: [0x0f,0x46,0xca] -; CHECK-NEXT: addb %cl, %al # EVEX TO LEGACY Compression encoding: [0x00,0xc8] +; CHECK-NEXT: cmovbel %edx, %edi # EVEX TO LEGACY Compression encoding: [0x0f,0x46,0xfa] +; CHECK-NEXT: movzbl (%rcx), %eax # encoding: [0x0f,0xb6,0x01] +; CHECK-NEXT: cmovbel %edx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x46,0xc2] +; CHECK-NEXT: addb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x00,0xf8] ; CHECK-NEXT: retq # encoding: [0xc3] entry: %cond = icmp ugt i8 %a, %b @@ -23,9 +23,9 @@ define i16 @cmov16(i16 %a, i16 %b, i16 %x, ptr %y.ptr) { ; CHECK-LABEL: cmov16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cmpw %si, %di # encoding: [0x66,0x39,0xf7] -; CHECK-NEXT: cmoval %edi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x47,0xd7] -; CHECK-NEXT: cmovaw (%rcx), %dx, %cx # encoding: [0x62,0xf4,0x75,0x18,0x47,0x11] -; CHECK-NEXT: addw %cx, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xc8] +; CHECK-NEXT: cmovbel %edx, %edi # EVEX TO LEGACY Compression encoding: [0x0f,0x46,0xfa] +; CHECK-NEXT: cmovaw (%rcx), %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x47,0x11] +; CHECK-NEXT: addw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xf8] ; CHECK-NEXT: retq # encoding: [0xc3] entry: %cond = icmp ugt i16 %a, %b @@ -41,8 +41,8 @@ define i32 @cmov32(i32 %a, i32 %b, i32 %x, ptr %y.ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cmpl %esi, %edi # encoding: [0x39,0xf7] ; CHECK-NEXT: cmoval %edi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x47,0xd7] -; CHECK-NEXT: cmoval (%rcx), %edx, %ecx # encoding: 
[0x62,0xf4,0x74,0x18,0x47,0x11] -; CHECK-NEXT: addl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xc8] +; CHECK-NEXT: cmoval (%rcx), %edx # EVEX TO LEGACY Compression encoding: [0x0f,0x47,0x11] +; CHECK-NEXT: addl %edx, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xd0] ; CHECK-NEXT: retq # encoding: [0xc3] entry: %cond = icmp ugt i32 %a, %b @@ -58,8 +58,8 @@ define i64 @cmov64(i64 %a, i64 %b, i64 %x, ptr %y.ptr) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cmpq %rsi, %rdi # encoding: [0x48,0x39,0xf7] ; CHECK-NEXT: cmovaq %rdi, %rdx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x47,0xd7] -; CHECK-NEXT: cmovaq (%rcx), %rdx, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x47,0x11] -; CHECK-NEXT: addq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xc8] +; CHECK-NEXT: cmovaq (%rcx), %rdx # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x47,0x11] +; CHECK-NEXT: addq %rdx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xd0] ; CHECK-NEXT: retq # encoding: [0xc3] entry: %cond = icmp ugt i64 %a, %b diff --git a/llvm/test/CodeGen/X86/apx/mul-i1024.ll b/llvm/test/CodeGen/X86/apx/mul-i1024.ll index 2b99c44fc769..a4d15a1b21d6 100644 --- a/llvm/test/CodeGen/X86/apx/mul-i1024.ll +++ b/llvm/test/CodeGen/X86/apx/mul-i1024.ll @@ -1041,41 +1041,41 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: pushq %r13 ; EGPR-NDD-NEXT: pushq %r12 ; EGPR-NDD-NEXT: pushq %rbx -; EGPR-NDD-NEXT: subq $104, %rsp +; EGPR-NDD-NEXT: subq $96, %rsp ; EGPR-NDD-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %rsi, %r15 ; EGPR-NDD-NEXT: movq %rdi, %r20 -; EGPR-NDD-NEXT: movq (%rdi), %r16 -; EGPR-NDD-NEXT: movq 8(%rdi), %r14 +; EGPR-NDD-NEXT: movq (%rdi), %r17 +; EGPR-NDD-NEXT: movq 8(%rdi), %r11 ; EGPR-NDD-NEXT: movq 24(%rdi), %r9 ; EGPR-NDD-NEXT: movq 16(%rdi), %r10 ; EGPR-NDD-NEXT: movq 40(%rdi), %rdi -; EGPR-NDD-NEXT: movq 32(%r20), %r11 -; EGPR-NDD-NEXT: movq 56(%r20), %r17 -; EGPR-NDD-NEXT: movq 48(%r20), %r15 -; 
EGPR-NDD-NEXT: movq 24(%rsi), %r18 +; EGPR-NDD-NEXT: movq 32(%r20), %r16 +; EGPR-NDD-NEXT: movq 56(%r20), %r18 +; EGPR-NDD-NEXT: movq 48(%r20), %r23 +; EGPR-NDD-NEXT: movq 24(%rsi), %r14 ; EGPR-NDD-NEXT: movq 16(%rsi), %r24 ; EGPR-NDD-NEXT: movq (%rsi), %r22 ; EGPR-NDD-NEXT: movq 8(%rsi), %r21 -; EGPR-NDD-NEXT: movq %rsi, %r23 -; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: movq %r23, %rax ; EGPR-NDD-NEXT: mulq %r22 ; EGPR-NDD-NEXT: movq %rdx, %r25 ; EGPR-NDD-NEXT: movq %rax, %r19 -; EGPR-NDD-NEXT: movq %r17, %rax +; EGPR-NDD-NEXT: movq %r18, %rax ; EGPR-NDD-NEXT: mulq %r22 -; EGPR-NDD-NEXT: addq %r25, %rax, %rcx -; EGPR-NDD-NEXT: adcq $0, %rdx, %rsi -; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: addq %rax, %r25 +; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx +; EGPR-NDD-NEXT: movq %r23, %rax ; EGPR-NDD-NEXT: mulq %r21 -; EGPR-NDD-NEXT: addq %rcx, %rax, %r8 -; EGPR-NDD-NEXT: adcq %rdx, %rsi, %rcx +; EGPR-NDD-NEXT: addq %r25, %rax, %rsi +; EGPR-NDD-NEXT: adcq %rdx, %rcx ; EGPR-NDD-NEXT: setb %al -; EGPR-NDD-NEXT: movzbl %al, %esi -; EGPR-NDD-NEXT: movq %r17, %rax +; EGPR-NDD-NEXT: movzbl %al, %r8d +; EGPR-NDD-NEXT: movq %r18, %rax ; EGPR-NDD-NEXT: mulq %r21 ; EGPR-NDD-NEXT: addq %rcx, %rax, %r27 -; EGPR-NDD-NEXT: adcq %rdx, %rsi -; EGPR-NDD-NEXT: movq %r11, %rax +; EGPR-NDD-NEXT: adcq %rdx, %r8 +; EGPR-NDD-NEXT: movq %r16, %rax ; EGPR-NDD-NEXT: mulq %r22 ; EGPR-NDD-NEXT: movq %rdx, %r26 ; EGPR-NDD-NEXT: movq %rax, %r25 @@ -1083,7 +1083,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: mulq %r22 ; EGPR-NDD-NEXT: addq %r26, %rax, %rcx ; EGPR-NDD-NEXT: adcq $0, %rdx, %r26 -; EGPR-NDD-NEXT: movq %r11, %rax +; EGPR-NDD-NEXT: movq %r16, %rax ; EGPR-NDD-NEXT: mulq %r21 ; EGPR-NDD-NEXT: addq %rax, %rcx ; EGPR-NDD-NEXT: adcq %rdx, %r26 @@ -1094,58 +1094,59 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: addq %r26, %rax ; EGPR-NDD-NEXT: adcq %r28, %rdx ; EGPR-NDD-NEXT: addq %rax, %r19, %r28 -; EGPR-NDD-NEXT: adcq 
%rdx, %r8 +; EGPR-NDD-NEXT: adcq %rdx, %rsi, %r29 ; EGPR-NDD-NEXT: adcq $0, %r27 -; EGPR-NDD-NEXT: adcq $0, %rsi, %r29 -; EGPR-NDD-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movq %r11, %rax +; EGPR-NDD-NEXT: adcq $0, %r8 +; EGPR-NDD-NEXT: movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r16, %rax ; EGPR-NDD-NEXT: mulq %r24 ; EGPR-NDD-NEXT: movq %rdx, %r19 ; EGPR-NDD-NEXT: movq %rax, %r26 ; EGPR-NDD-NEXT: movq %rdi, %rax ; EGPR-NDD-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: mulq %r24 -; EGPR-NDD-NEXT: addq %r19, %rax, %rsi -; EGPR-NDD-NEXT: adcq $0, %rdx, %r19 -; EGPR-NDD-NEXT: movq %r11, %rax -; EGPR-NDD-NEXT: mulq %r18 -; EGPR-NDD-NEXT: addq %rsi, %rax, %r30 -; EGPR-NDD-NEXT: adcq %rdx, %r19, %rsi +; EGPR-NDD-NEXT: addq %rax, %r19 +; EGPR-NDD-NEXT: adcq $0, %rdx, %rsi +; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: mulq %r14 +; EGPR-NDD-NEXT: addq %rax, %r19 +; EGPR-NDD-NEXT: adcq %rdx, %rsi ; EGPR-NDD-NEXT: setb %al -; EGPR-NDD-NEXT: movzbl %al, %r19d +; EGPR-NDD-NEXT: movzbl %al, %r30d ; EGPR-NDD-NEXT: movq %rdi, %rax -; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: mulq %r14 ; EGPR-NDD-NEXT: addq %rsi, %rax -; EGPR-NDD-NEXT: adcq %r19, %rdx +; EGPR-NDD-NEXT: adcq %r30, %rdx ; EGPR-NDD-NEXT: addq %r28, %r26, %rsi -; EGPR-NDD-NEXT: adcq %r8, %r30, %r28 +; EGPR-NDD-NEXT: adcq %r29, %r19, %r28 ; EGPR-NDD-NEXT: adcq $0, %rax ; EGPR-NDD-NEXT: adcq $0, %rdx -; EGPR-NDD-NEXT: addq %rax, %r27, %r8 -; EGPR-NDD-NEXT: adcq %rdx, %r29, %r27 +; EGPR-NDD-NEXT: addq %rax, %r27 +; EGPR-NDD-NEXT: adcq %rdx, %r8 ; EGPR-NDD-NEXT: setb %al ; EGPR-NDD-NEXT: movzbl %al, %r31d -; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: movq %r23, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r23, %rax ; EGPR-NDD-NEXT: mulq %r24 ; EGPR-NDD-NEXT: movq %rdx, %r19 ; EGPR-NDD-NEXT: movq %rax, %r26 -; EGPR-NDD-NEXT: movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movq 
%r17, %rax +; EGPR-NDD-NEXT: movq %r18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r18, %rax ; EGPR-NDD-NEXT: mulq %r24 ; EGPR-NDD-NEXT: addq %rax, %r19 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r29 -; EGPR-NDD-NEXT: movq %r15, %rax -; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: mulq %r14 ; EGPR-NDD-NEXT: addq %rax, %r19 ; EGPR-NDD-NEXT: adcq %rdx, %r29 ; EGPR-NDD-NEXT: setb %al ; EGPR-NDD-NEXT: movzbl %al, %r30d -; EGPR-NDD-NEXT: movq %r17, %rax -; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: movq %r18, %rax +; EGPR-NDD-NEXT: mulq %r14 ; EGPR-NDD-NEXT: addq %r29, %rax ; EGPR-NDD-NEXT: adcq %r30, %rdx -; EGPR-NDD-NEXT: addq %r8, %r26, %r29 -; EGPR-NDD-NEXT: adcq %r27, %r19, %r30 +; EGPR-NDD-NEXT: addq %r27, %r26, %r29 +; EGPR-NDD-NEXT: adcq %r8, %r19, %r30 ; EGPR-NDD-NEXT: adcq %rax, %r31 ; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi ; EGPR-NDD-NEXT: movq %r10, %rax @@ -1154,69 +1155,69 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: movq %rax, %r26 ; EGPR-NDD-NEXT: movq %r9, %rax ; EGPR-NDD-NEXT: mulq %r22 -; EGPR-NDD-NEXT: addq %r19, %rax, %r8 -; EGPR-NDD-NEXT: adcq $0, %rdx, %r19 +; EGPR-NDD-NEXT: addq %rax, %r19 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r8 ; EGPR-NDD-NEXT: movq %r10, %rax ; EGPR-NDD-NEXT: mulq %r21 -; EGPR-NDD-NEXT: addq %rax, %r8 -; EGPR-NDD-NEXT: adcq %rdx, %r19 +; EGPR-NDD-NEXT: addq %rax, %r19 +; EGPR-NDD-NEXT: adcq %rdx, %r8 ; EGPR-NDD-NEXT: setb %al ; EGPR-NDD-NEXT: movzbl %al, %r27d ; EGPR-NDD-NEXT: movq %r9, %rax ; EGPR-NDD-NEXT: mulq %r21 -; EGPR-NDD-NEXT: addq %rax, %r19 +; EGPR-NDD-NEXT: addq %rax, %r8 ; EGPR-NDD-NEXT: adcq %r27, %rdx, %rbx -; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: movq %r17, %rax ; EGPR-NDD-NEXT: mulq %r22 ; EGPR-NDD-NEXT: movq %rdx, %r27 ; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movq %r14, %rax +; EGPR-NDD-NEXT: movq %r11, %rax ; EGPR-NDD-NEXT: mulq %r22 ; EGPR-NDD-NEXT: addq %rax, %r27 ; EGPR-NDD-NEXT: 
adcq $0, %rdx, %r12 -; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: movq %r17, %rax ; EGPR-NDD-NEXT: mulq %r21 ; EGPR-NDD-NEXT: addq %r27, %rax ; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq %rdx, %r12, %r27 -; EGPR-NDD-NEXT: setb %bpl -; EGPR-NDD-NEXT: movq %r14, %rax +; EGPR-NDD-NEXT: adcq %rdx, %r12 +; EGPR-NDD-NEXT: setb %r27b +; EGPR-NDD-NEXT: movq %r11, %rax ; EGPR-NDD-NEXT: mulq %r21 -; EGPR-NDD-NEXT: addq %r27, %rax -; EGPR-NDD-NEXT: movzbl %bpl, %r27d +; EGPR-NDD-NEXT: addq %r12, %rax +; EGPR-NDD-NEXT: movzbl %r27b, %r27d ; EGPR-NDD-NEXT: adcq %r27, %rdx ; EGPR-NDD-NEXT: addq %rax, %r26, %r12 -; EGPR-NDD-NEXT: adcq %rdx, %r8 -; EGPR-NDD-NEXT: adcq $0, %r19 +; EGPR-NDD-NEXT: adcq %rdx, %r19 +; EGPR-NDD-NEXT: adcq $0, %r8 ; EGPR-NDD-NEXT: adcq $0, %rbx -; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: movq %r17, %rax ; EGPR-NDD-NEXT: mulq %r24 ; EGPR-NDD-NEXT: movq %rdx, %r26 ; EGPR-NDD-NEXT: movq %rax, %r27 -; EGPR-NDD-NEXT: movq %r14, %rax +; EGPR-NDD-NEXT: movq %r11, %rax ; EGPR-NDD-NEXT: mulq %r24 ; EGPR-NDD-NEXT: addq %rax, %r26 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r13 -; EGPR-NDD-NEXT: movq %r16, %rax -; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: movq %r17, %rax +; EGPR-NDD-NEXT: mulq %r14 ; EGPR-NDD-NEXT: addq %rax, %r26 ; EGPR-NDD-NEXT: adcq %rdx, %r13 ; EGPR-NDD-NEXT: setb %bpl -; EGPR-NDD-NEXT: movq %r14, %rax -; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: movq %r11, %rax +; EGPR-NDD-NEXT: mulq %r14 ; EGPR-NDD-NEXT: addq %r13, %rax ; EGPR-NDD-NEXT: movzbl %bpl, %r13d ; EGPR-NDD-NEXT: adcq %r13, %rdx -; EGPR-NDD-NEXT: addq %r12, %r27, %r11 -; EGPR-NDD-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq %r26, %r8 -; EGPR-NDD-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: addq %r12, %r27 +; EGPR-NDD-NEXT: movq %r27, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %r26, %r19 +; EGPR-NDD-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte 
Spill ; EGPR-NDD-NEXT: adcq $0, %rax ; EGPR-NDD-NEXT: adcq $0, %rdx -; EGPR-NDD-NEXT: addq %rax, %r19, %r8 -; EGPR-NDD-NEXT: adcq %rdx, %rbx, %r19 -; EGPR-NDD-NEXT: setb %bl -; EGPR-NDD-NEXT: movq %r10, %r17 +; EGPR-NDD-NEXT: addq %rax, %r8 +; EGPR-NDD-NEXT: adcq %rdx, %rbx +; EGPR-NDD-NEXT: setb %r19b +; EGPR-NDD-NEXT: movq %r10, %r16 ; EGPR-NDD-NEXT: movq %r10, %rax ; EGPR-NDD-NEXT: mulq %r24 ; EGPR-NDD-NEXT: movq %rdx, %r26 @@ -1226,32 +1227,31 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: addq %rax, %r26 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r12 ; EGPR-NDD-NEXT: movq %r10, %rax -; EGPR-NDD-NEXT: movq %r18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: mulq %r14 ; EGPR-NDD-NEXT: addq %rax, %r26 ; EGPR-NDD-NEXT: adcq %rdx, %r12 ; EGPR-NDD-NEXT: setb %bpl ; EGPR-NDD-NEXT: movq %r9, %rax -; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: mulq %r14 ; EGPR-NDD-NEXT: addq %r12, %rax ; EGPR-NDD-NEXT: movzbl %bpl, %r12d ; EGPR-NDD-NEXT: adcq %r12, %rdx ; EGPR-NDD-NEXT: addq %r27, %r8 -; EGPR-NDD-NEXT: adcq %r26, %r19 -; EGPR-NDD-NEXT: movzbl %bl, %r26d -; EGPR-NDD-NEXT: adcq %r26, %rax +; EGPR-NDD-NEXT: adcq %r26, %rbx +; EGPR-NDD-NEXT: movzbl %r19b, %r19d +; EGPR-NDD-NEXT: adcq %r19, %rax ; EGPR-NDD-NEXT: adcq $0, %rdx ; EGPR-NDD-NEXT: addq %r8, %r25, %r12 -; EGPR-NDD-NEXT: movq 32(%r23), %r26 -; EGPR-NDD-NEXT: adcq %r19, %rcx, %r13 +; EGPR-NDD-NEXT: movq 32(%r15), %r26 +; EGPR-NDD-NEXT: adcq %rbx, %rcx, %r13 ; EGPR-NDD-NEXT: adcq %rax, %rsi, %rbp ; EGPR-NDD-NEXT: adcq %rdx, %r28, %rbx -; EGPR-NDD-NEXT: adcq $0, %r29, %rax -; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq $0, %r29 +; EGPR-NDD-NEXT: movq %r29, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: adcq $0, %r30 ; EGPR-NDD-NEXT: adcq $0, %r31 -; EGPR-NDD-NEXT: adcq $0, %rdi, %rax -; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq $0, %rdi +; EGPR-NDD-NEXT: 
movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: movq %r10, %rax ; EGPR-NDD-NEXT: mulq %r26 ; EGPR-NDD-NEXT: movq %rdx, %r25 @@ -1259,341 +1259,333 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: movq %r9, %r19 ; EGPR-NDD-NEXT: movq %r9, %rax ; EGPR-NDD-NEXT: mulq %r26 -; EGPR-NDD-NEXT: addq %r25, %rax, %rcx -; EGPR-NDD-NEXT: adcq $0, %rdx, %r8 -; EGPR-NDD-NEXT: movq 40(%r23), %r18 -; EGPR-NDD-NEXT: movq %r23, %r11 +; EGPR-NDD-NEXT: addq %rax, %r25 +; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx +; EGPR-NDD-NEXT: movq 40(%r15), %r18 ; EGPR-NDD-NEXT: movq %r10, %rax ; EGPR-NDD-NEXT: mulq %r18 -; EGPR-NDD-NEXT: addq %rcx, %rax, %rdi -; EGPR-NDD-NEXT: adcq %rdx, %r8 -; EGPR-NDD-NEXT: setb %r25b +; EGPR-NDD-NEXT: addq %r25, %rax, %r29 +; EGPR-NDD-NEXT: adcq %rdx, %rcx +; EGPR-NDD-NEXT: setb %r8b ; EGPR-NDD-NEXT: movq %r9, %rax ; EGPR-NDD-NEXT: mulq %r18 -; EGPR-NDD-NEXT: addq %r8, %rax, %r29 -; EGPR-NDD-NEXT: movzbl %r25b, %eax +; EGPR-NDD-NEXT: addq %rcx, %rax, %rdi +; EGPR-NDD-NEXT: movzbl %r8b, %eax ; EGPR-NDD-NEXT: adcq %rax, %rdx, %rsi -; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: movq %r17, %rax ; EGPR-NDD-NEXT: mulq %r26 ; EGPR-NDD-NEXT: movq %rdx, %r28 ; EGPR-NDD-NEXT: movq %rax, %r25 -; EGPR-NDD-NEXT: movq %r14, %rax +; EGPR-NDD-NEXT: movq %r11, %r10 +; EGPR-NDD-NEXT: movq %r11, %rax ; EGPR-NDD-NEXT: mulq %r26 ; EGPR-NDD-NEXT: addq %r28, %rax, %r8 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r28 -; EGPR-NDD-NEXT: movq %r16, %rax -; EGPR-NDD-NEXT: movq %r16, %r10 +; EGPR-NDD-NEXT: movq %r17, %rax ; EGPR-NDD-NEXT: mulq %r18 ; EGPR-NDD-NEXT: addq %r8, %rax, %r23 ; EGPR-NDD-NEXT: adcq %rdx, %r28 ; EGPR-NDD-NEXT: setb %cl -; EGPR-NDD-NEXT: movq %r14, %rax -; EGPR-NDD-NEXT: movq %r14, %r16 +; EGPR-NDD-NEXT: movq %r11, %rax ; EGPR-NDD-NEXT: mulq %r18 ; EGPR-NDD-NEXT: addq %r28, %rax ; EGPR-NDD-NEXT: movzbl %cl, %ecx ; EGPR-NDD-NEXT: adcq %rdx, %rcx ; EGPR-NDD-NEXT: addq %rax, %r27 -; EGPR-NDD-NEXT: adcq %rcx, %rdi -; EGPR-NDD-NEXT: 
adcq $0, %r29, %r8 +; EGPR-NDD-NEXT: adcq %rcx, %r29, %r8 +; EGPR-NDD-NEXT: adcq $0, %rdi ; EGPR-NDD-NEXT: adcq $0, %rsi, %r9 -; EGPR-NDD-NEXT: movq %r11, %r14 -; EGPR-NDD-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movq 48(%r11), %r11 -; EGPR-NDD-NEXT: movq %r10, %rsi -; EGPR-NDD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movq %r10, %rax +; EGPR-NDD-NEXT: movq 48(%r15), %r11 +; EGPR-NDD-NEXT: movq %r17, %rsi +; EGPR-NDD-NEXT: movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r17, %rax ; EGPR-NDD-NEXT: mulq %r11 ; EGPR-NDD-NEXT: movq %rdx, %r28 ; EGPR-NDD-NEXT: movq %rax, %r29 -; EGPR-NDD-NEXT: movq %r16, %rax -; EGPR-NDD-NEXT: movq %r16, %r10 -; EGPR-NDD-NEXT: movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r10, %rax +; EGPR-NDD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: mulq %r11 ; EGPR-NDD-NEXT: addq %rax, %r28 ; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx -; EGPR-NDD-NEXT: movq 56(%r14), %r16 +; EGPR-NDD-NEXT: movq 56(%r15), %r17 ; EGPR-NDD-NEXT: movq %rsi, %rax -; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: mulq %r17 ; EGPR-NDD-NEXT: addq %rax, %r28 ; EGPR-NDD-NEXT: adcq %rdx, %rcx ; EGPR-NDD-NEXT: setb %sil ; EGPR-NDD-NEXT: movq %r10, %rax -; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: mulq %r17 ; EGPR-NDD-NEXT: addq %rcx, %rax ; EGPR-NDD-NEXT: movzbl %sil, %ecx ; EGPR-NDD-NEXT: adcq %rdx, %rcx -; EGPR-NDD-NEXT: addq %r27, %r29, %r10 -; EGPR-NDD-NEXT: adcq %r28, %rdi +; EGPR-NDD-NEXT: addq %r29, %r27 +; EGPR-NDD-NEXT: adcq %r8, %r28, %r10 ; EGPR-NDD-NEXT: adcq $0, %rax ; EGPR-NDD-NEXT: adcq $0, %rcx -; EGPR-NDD-NEXT: addq %rax, %r8 -; EGPR-NDD-NEXT: adcq %rcx, %r9, %rsi -; EGPR-NDD-NEXT: setb %r9b -; EGPR-NDD-NEXT: movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movq %r17, %rax +; EGPR-NDD-NEXT: addq %rax, %rdi +; EGPR-NDD-NEXT: adcq %rcx, %r9, %r8 +; EGPR-NDD-NEXT: setb %sil +; EGPR-NDD-NEXT: movq %r16, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r16, %rax ; EGPR-NDD-NEXT: mulq %r11 ; EGPR-NDD-NEXT: movq %rdx, %r28 ; EGPR-NDD-NEXT: movq %rax, %r29 +; EGPR-NDD-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: movq %r19, %rax ; EGPR-NDD-NEXT: mulq %r11 -; EGPR-NDD-NEXT: addq %r28, %rax, %r27 -; EGPR-NDD-NEXT: adcq $0, %rdx, %r28 -; EGPR-NDD-NEXT: movq %r17, %rax -; EGPR-NDD-NEXT: mulq %r16 -; EGPR-NDD-NEXT: addq %rax, %r27 -; EGPR-NDD-NEXT: adcq %rdx, %r28 +; EGPR-NDD-NEXT: addq %rax, %r28 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r9 +; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: addq %rax, %r28 +; EGPR-NDD-NEXT: adcq %rdx, %r9 ; EGPR-NDD-NEXT: setb %cl ; EGPR-NDD-NEXT: movq %r19, %rax -; EGPR-NDD-NEXT: movq %r19, %r17 -; EGPR-NDD-NEXT: mulq %r16 -; EGPR-NDD-NEXT: addq %r28, %rax +; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: addq %r9, %rax ; EGPR-NDD-NEXT: movzbl %cl, %ecx ; EGPR-NDD-NEXT: adcq %rdx, %rcx -; EGPR-NDD-NEXT: addq %r8, %r29, %rdx -; EGPR-NDD-NEXT: adcq %r27, %rsi -; EGPR-NDD-NEXT: movzbl %r9b, %r8d -; EGPR-NDD-NEXT: adcq %r8, %rax +; EGPR-NDD-NEXT: addq %r29, %rdi +; EGPR-NDD-NEXT: adcq %r28, %r8 +; EGPR-NDD-NEXT: movzbl %sil, %edx +; EGPR-NDD-NEXT: adcq %rdx, %rax ; EGPR-NDD-NEXT: adcq $0, %rcx -; EGPR-NDD-NEXT: addq %r12, %r25, %r8 -; EGPR-NDD-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq %r13, %r23, %r8 -; EGPR-NDD-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq %rbp, %r10, %r8 -; EGPR-NDD-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq %rbx, %rdi -; EGPR-NDD-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq $0, %rdx -; EGPR-NDD-NEXT: adcq $0, %rsi +; EGPR-NDD-NEXT: addq %r12, %r25 +; EGPR-NDD-NEXT: movq %r25, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %r13, %r23, %r19 +; EGPR-NDD-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; 
EGPR-NDD-NEXT: adcq %rbp, %r27 +; EGPR-NDD-NEXT: movq %r27, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %rbx, %r10 +; EGPR-NDD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq $0, %rdi +; EGPR-NDD-NEXT: adcq $0, %r8 ; EGPR-NDD-NEXT: adcq $0, %rax -; EGPR-NDD-NEXT: adcq $0, %rcx, %rdi -; EGPR-NDD-NEXT: addq %rdx, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; EGPR-NDD-NEXT: adcq %rsi, %r30, %r19 -; EGPR-NDD-NEXT: adcq %rax, %r31, %r30 -; EGPR-NDD-NEXT: adcq %rdi, {{[-0-9]+}}(%r{{[sb]}}p), %r31 # 8-byte Folded Reload -; EGPR-NDD-NEXT: setb %bpl -; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: adcq $0, %rcx +; EGPR-NDD-NEXT: addq %rdi, {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %r8, %r30 +; EGPR-NDD-NEXT: adcq %rax, %r31 +; EGPR-NDD-NEXT: adcq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; EGPR-NDD-NEXT: setb %r8b +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; EGPR-NDD-NEXT: movq %r13, %rax ; EGPR-NDD-NEXT: mulq %r26 ; EGPR-NDD-NEXT: movq %rdx, %r25 ; EGPR-NDD-NEXT: movq %rax, %r28 -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; EGPR-NDD-NEXT: movq %r9, %rax +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; EGPR-NDD-NEXT: movq %r10, %rax ; EGPR-NDD-NEXT: mulq %r26 -; EGPR-NDD-NEXT: addq %r25, %rax, %rsi -; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi -; EGPR-NDD-NEXT: movq %r15, %rax -; EGPR-NDD-NEXT: movq %r15, %r13 +; EGPR-NDD-NEXT: addq %rax, %r25 +; EGPR-NDD-NEXT: adcq $0, %rdx, %rsi +; EGPR-NDD-NEXT: movq %r13, %rax ; EGPR-NDD-NEXT: mulq %r18 -; EGPR-NDD-NEXT: addq %rax, %rsi -; EGPR-NDD-NEXT: adcq %rdx, %rdi -; EGPR-NDD-NEXT: setb %r8b -; EGPR-NDD-NEXT: movq %r9, %rax -; EGPR-NDD-NEXT: movq %r9, %r23 +; EGPR-NDD-NEXT: addq %r25, %rax, %rdi +; EGPR-NDD-NEXT: adcq %rdx, %rsi +; EGPR-NDD-NEXT: setb %r9b +; EGPR-NDD-NEXT: movq %r10, %rax +; EGPR-NDD-NEXT: movq %r10, %r16 ; EGPR-NDD-NEXT: mulq 
%r18 -; EGPR-NDD-NEXT: addq %rax, %rdi -; EGPR-NDD-NEXT: movzbl %r8b, %eax -; EGPR-NDD-NEXT: adcq %rax, %rdx, %r8 -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; EGPR-NDD-NEXT: movq %r14, %rax +; EGPR-NDD-NEXT: addq %rax, %rsi +; EGPR-NDD-NEXT: movzbl %r9b, %eax +; EGPR-NDD-NEXT: adcq %rax, %rdx, %r9 +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Reload +; EGPR-NDD-NEXT: movq %r23, %rax ; EGPR-NDD-NEXT: mulq %r26 ; EGPR-NDD-NEXT: movq %rdx, %r29 ; EGPR-NDD-NEXT: movq %rax, %r25 -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; EGPR-NDD-NEXT: movq %r12, %rax ; EGPR-NDD-NEXT: mulq %r26 -; EGPR-NDD-NEXT: addq %r29, %rax, %r9 +; EGPR-NDD-NEXT: addq %rax, %r29 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r10 -; EGPR-NDD-NEXT: movq %r14, %rax +; EGPR-NDD-NEXT: movq %r23, %rax ; EGPR-NDD-NEXT: mulq %r18 -; EGPR-NDD-NEXT: addq %r9, %rax, %rbx -; EGPR-NDD-NEXT: adcq %rdx, %r10, %r9 -; EGPR-NDD-NEXT: setb %r10b -; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: addq %r29, %rax, %rbx +; EGPR-NDD-NEXT: adcq %rdx, %r10 +; EGPR-NDD-NEXT: setb %r27b +; EGPR-NDD-NEXT: movq %r12, %rax ; EGPR-NDD-NEXT: mulq %r18 -; EGPR-NDD-NEXT: addq %r9, %rax -; EGPR-NDD-NEXT: movzbl %r10b, %r9d -; EGPR-NDD-NEXT: adcq %r9, %rdx -; EGPR-NDD-NEXT: addq %rax, %r28, %r9 -; EGPR-NDD-NEXT: adcq %rdx, %rsi -; EGPR-NDD-NEXT: adcq $0, %rdi -; EGPR-NDD-NEXT: adcq $0, %r8 -; EGPR-NDD-NEXT: movq %r14, %rax +; EGPR-NDD-NEXT: addq %r10, %rax +; EGPR-NDD-NEXT: movzbl %r27b, %r10d +; EGPR-NDD-NEXT: adcq %r10, %rdx +; EGPR-NDD-NEXT: addq %rax, %r28, %r10 +; EGPR-NDD-NEXT: adcq %rdx, %rdi +; EGPR-NDD-NEXT: adcq $0, %rsi +; EGPR-NDD-NEXT: adcq $0, %r9 +; EGPR-NDD-NEXT: movq %r23, %rax ; EGPR-NDD-NEXT: mulq %r11 ; EGPR-NDD-NEXT: movq %rdx, %r28 ; EGPR-NDD-NEXT: movq %rax, %r29 -; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: movq %r12, %rax ; EGPR-NDD-NEXT: 
mulq %r11 -; EGPR-NDD-NEXT: addq %r28, %rax, %r10 +; EGPR-NDD-NEXT: addq %rax, %r28 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r27 -; EGPR-NDD-NEXT: movq %r14, %rax -; EGPR-NDD-NEXT: mulq %r16 -; EGPR-NDD-NEXT: addq %rax, %r10 +; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: addq %rax, %r28 ; EGPR-NDD-NEXT: adcq %rdx, %r27 -; EGPR-NDD-NEXT: setb %r28b -; EGPR-NDD-NEXT: movq %r15, %rax -; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: setb %bpl +; EGPR-NDD-NEXT: movq %r12, %rax +; EGPR-NDD-NEXT: mulq %r17 ; EGPR-NDD-NEXT: addq %r27, %rax -; EGPR-NDD-NEXT: movzbl %r28b, %r27d +; EGPR-NDD-NEXT: movzbl %bpl, %r27d ; EGPR-NDD-NEXT: adcq %r27, %rdx -; EGPR-NDD-NEXT: addq %r29, %r9 -; EGPR-NDD-NEXT: adcq %r10, %rsi +; EGPR-NDD-NEXT: addq %r29, %r10 +; EGPR-NDD-NEXT: adcq %r28, %rdi ; EGPR-NDD-NEXT: adcq $0, %rax ; EGPR-NDD-NEXT: adcq $0, %rdx -; EGPR-NDD-NEXT: addq %rax, %rdi -; EGPR-NDD-NEXT: adcq %rdx, %r8 -; EGPR-NDD-NEXT: setb %r10b +; EGPR-NDD-NEXT: addq %rax, %rsi +; EGPR-NDD-NEXT: adcq %rdx, %r9 +; EGPR-NDD-NEXT: setb %r27b ; EGPR-NDD-NEXT: movq %r13, %rax ; EGPR-NDD-NEXT: mulq %r11 ; EGPR-NDD-NEXT: movq %rdx, %r28 ; EGPR-NDD-NEXT: movq %rax, %r29 -; EGPR-NDD-NEXT: movq %r23, %r14 -; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: movq %r16, %rax ; EGPR-NDD-NEXT: mulq %r11 -; EGPR-NDD-NEXT: addq %r28, %rax, %r27 -; EGPR-NDD-NEXT: adcq $0, %rdx, %r28 +; EGPR-NDD-NEXT: addq %rax, %r28 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r12 ; EGPR-NDD-NEXT: movq %r13, %rax -; EGPR-NDD-NEXT: mulq %r16 -; EGPR-NDD-NEXT: addq %rax, %r27 -; EGPR-NDD-NEXT: adcq %rdx, %r28 -; EGPR-NDD-NEXT: setb %r15b -; EGPR-NDD-NEXT: movq %r23, %rax -; EGPR-NDD-NEXT: mulq %r16 -; EGPR-NDD-NEXT: addq %r28, %rax -; EGPR-NDD-NEXT: movzbl %r15b, %r28d -; EGPR-NDD-NEXT: adcq %r28, %rdx -; EGPR-NDD-NEXT: addq %r29, %rdi -; EGPR-NDD-NEXT: adcq %r27, %r8 -; EGPR-NDD-NEXT: movzbl %r10b, %r10d -; EGPR-NDD-NEXT: adcq %r10, %rax +; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: addq %rax, %r28 +; 
EGPR-NDD-NEXT: adcq %rdx, %r12 +; EGPR-NDD-NEXT: setb %bpl +; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: addq %r12, %rax +; EGPR-NDD-NEXT: movzbl %bpl, %r12d +; EGPR-NDD-NEXT: adcq %r12, %rdx +; EGPR-NDD-NEXT: addq %r29, %rsi +; EGPR-NDD-NEXT: adcq %r28, %r9 +; EGPR-NDD-NEXT: movzbl %r27b, %r27d +; EGPR-NDD-NEXT: adcq %r27, %rax ; EGPR-NDD-NEXT: adcq $0, %rdx -; EGPR-NDD-NEXT: addq %r25, %rcx -; EGPR-NDD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq %r19, %rbx, %rcx -; EGPR-NDD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq %r30, %r9, %rcx -; EGPR-NDD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq %r31, %rsi, %rcx -; EGPR-NDD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: movzbl %bpl, %ecx +; EGPR-NDD-NEXT: addq %r25, %r19 +; EGPR-NDD-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %rbx, %r30 +; EGPR-NDD-NEXT: movq %r30, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %r31, %r10 +; EGPR-NDD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: adcq %rdi, %rcx ; EGPR-NDD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq $0, %r8, %rcx +; EGPR-NDD-NEXT: movzbl %r8b, %ecx +; EGPR-NDD-NEXT: adcq %rsi, %rcx ; EGPR-NDD-NEXT: movq %rcx, (%rsp) # 8-byte Spill +; EGPR-NDD-NEXT: adcq $0, %r9 +; EGPR-NDD-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: adcq $0, %rax ; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq $0, %rdx, %rax -; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq $0, %rdx +; EGPR-NDD-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: movq 64(%r20), %r28 ; EGPR-NDD-NEXT: movq %r24, %rax ; EGPR-NDD-NEXT: mulq %r28 ; EGPR-NDD-NEXT: movq %rdx, %r25 ; EGPR-NDD-NEXT: movq %rax, %r30 -; 
EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Reload -; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: movq %r14, %rax ; EGPR-NDD-NEXT: mulq %r28 -; EGPR-NDD-NEXT: addq %r25, %rax, %rcx -; EGPR-NDD-NEXT: adcq $0, %rdx, %rsi +; EGPR-NDD-NEXT: addq %rax, %r25 +; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx ; EGPR-NDD-NEXT: movq 72(%r20), %r29 ; EGPR-NDD-NEXT: movq %r24, %rax ; EGPR-NDD-NEXT: mulq %r29 -; EGPR-NDD-NEXT: addq %rax, %rcx -; EGPR-NDD-NEXT: adcq %rdx, %rsi -; EGPR-NDD-NEXT: setb %dil -; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: addq %rax, %r25 +; EGPR-NDD-NEXT: adcq %rdx, %rcx +; EGPR-NDD-NEXT: setb %sil +; EGPR-NDD-NEXT: movq %r14, %rax ; EGPR-NDD-NEXT: mulq %r29 -; EGPR-NDD-NEXT: addq %rax, %rsi -; EGPR-NDD-NEXT: movzbl %dil, %eax -; EGPR-NDD-NEXT: adcq %rax, %rdx, %rdi +; EGPR-NDD-NEXT: addq %rax, %rcx +; EGPR-NDD-NEXT: movzbl %sil, %eax +; EGPR-NDD-NEXT: adcq %rax, %rdx, %rsi ; EGPR-NDD-NEXT: movq %r22, %rax ; EGPR-NDD-NEXT: mulq %r28 ; EGPR-NDD-NEXT: movq %rdx, %r31 ; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: movq %r21, %rax ; EGPR-NDD-NEXT: mulq %r28 -; EGPR-NDD-NEXT: addq %r31, %rax, %r8 -; EGPR-NDD-NEXT: adcq $0, %rdx, %r9 +; EGPR-NDD-NEXT: addq %rax, %r31 +; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi ; EGPR-NDD-NEXT: movq %r22, %rax ; EGPR-NDD-NEXT: mulq %r29 -; EGPR-NDD-NEXT: addq %r8, %rax +; EGPR-NDD-NEXT: addq %r31, %rax ; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq %rdx, %r9, %r8 -; EGPR-NDD-NEXT: setb %r9b +; EGPR-NDD-NEXT: adcq %rdx, %rdi +; EGPR-NDD-NEXT: setb %r8b ; EGPR-NDD-NEXT: movq %r21, %rax ; EGPR-NDD-NEXT: mulq %r29 -; EGPR-NDD-NEXT: addq %r8, %rax -; EGPR-NDD-NEXT: movzbl %r9b, %r8d -; EGPR-NDD-NEXT: adcq %r8, %rdx -; EGPR-NDD-NEXT: addq %rax, %r30, %r8 -; EGPR-NDD-NEXT: adcq %rdx, %rcx +; EGPR-NDD-NEXT: addq %rdi, %rax +; EGPR-NDD-NEXT: movzbl %r8b, %edi +; EGPR-NDD-NEXT: adcq %rdi, %rdx +; EGPR-NDD-NEXT: addq %rax, %r30, %rdi +; 
EGPR-NDD-NEXT: adcq %rdx, %r25 +; EGPR-NDD-NEXT: adcq $0, %rcx ; EGPR-NDD-NEXT: adcq $0, %rsi -; EGPR-NDD-NEXT: adcq $0, %rdi -; EGPR-NDD-NEXT: movq 80(%r20), %rbx +; EGPR-NDD-NEXT: movq 80(%r20), %r8 ; EGPR-NDD-NEXT: movq %r22, %rax -; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: mulq %r8 ; EGPR-NDD-NEXT: movq %rdx, %r30 ; EGPR-NDD-NEXT: movq %rax, %r31 ; EGPR-NDD-NEXT: movq %r21, %rax -; EGPR-NDD-NEXT: mulq %rbx -; EGPR-NDD-NEXT: addq %r30, %rax, %r9 -; EGPR-NDD-NEXT: adcq $0, %rdx, %r10 -; EGPR-NDD-NEXT: movq 88(%r20), %r15 +; EGPR-NDD-NEXT: mulq %r8 +; EGPR-NDD-NEXT: addq %rax, %r30 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r9 +; EGPR-NDD-NEXT: movq 88(%r20), %rbx ; EGPR-NDD-NEXT: movq %r22, %rax -; EGPR-NDD-NEXT: mulq %r15 -; EGPR-NDD-NEXT: addq %rax, %r9 -; EGPR-NDD-NEXT: adcq %rdx, %r10 -; EGPR-NDD-NEXT: setb %r19b +; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: addq %rax, %r30 +; EGPR-NDD-NEXT: adcq %rdx, %r9 +; EGPR-NDD-NEXT: setb %r10b ; EGPR-NDD-NEXT: movq %r21, %rax -; EGPR-NDD-NEXT: mulq %r15 -; EGPR-NDD-NEXT: addq %r10, %rax -; EGPR-NDD-NEXT: movzbl %r19b, %r10d -; EGPR-NDD-NEXT: adcq %r10, %rdx -; EGPR-NDD-NEXT: addq %r31, %r8 -; EGPR-NDD-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; EGPR-NDD-NEXT: adcq %r9, %rcx -; EGPR-NDD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: addq %r9, %rax +; EGPR-NDD-NEXT: movzbl %r10b, %r9d +; EGPR-NDD-NEXT: adcq %r9, %rdx +; EGPR-NDD-NEXT: addq %r31, %rdi +; EGPR-NDD-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %r25, %r30, %rbp ; EGPR-NDD-NEXT: adcq $0, %rax -; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx -; EGPR-NDD-NEXT: addq %rax, %rsi -; EGPR-NDD-NEXT: adcq %rdi, %rcx +; EGPR-NDD-NEXT: adcq $0, %rdx +; EGPR-NDD-NEXT: addq %rax, %rcx +; EGPR-NDD-NEXT: adcq %rdx, %rsi ; EGPR-NDD-NEXT: setb %dil ; EGPR-NDD-NEXT: movq %r24, %rax -; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: mulq %r8 ; EGPR-NDD-NEXT: movq %rdx, %r30 ; EGPR-NDD-NEXT: movq 
%rax, %r31 -; EGPR-NDD-NEXT: movq %r23, %rax -; EGPR-NDD-NEXT: mulq %rbx -; EGPR-NDD-NEXT: addq %r30, %rax, %r8 +; EGPR-NDD-NEXT: movq %r14, %rax +; EGPR-NDD-NEXT: mulq %r8 +; EGPR-NDD-NEXT: addq %rax, %r30 ; EGPR-NDD-NEXT: adcq $0, %rdx, %r9 ; EGPR-NDD-NEXT: movq %r24, %rax -; EGPR-NDD-NEXT: mulq %r15 -; EGPR-NDD-NEXT: addq %rax, %r8 +; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: addq %rax, %r30 ; EGPR-NDD-NEXT: adcq %rdx, %r9 ; EGPR-NDD-NEXT: setb %r10b -; EGPR-NDD-NEXT: movq %r23, %rax -; EGPR-NDD-NEXT: mulq %r15 +; EGPR-NDD-NEXT: movq %r14, %rax +; EGPR-NDD-NEXT: mulq %rbx ; EGPR-NDD-NEXT: addq %r9, %rax ; EGPR-NDD-NEXT: movzbl %r10b, %r9d ; EGPR-NDD-NEXT: adcq %r9, %rdx -; EGPR-NDD-NEXT: addq %rsi, %r31, %r25 -; EGPR-NDD-NEXT: adcq %rcx, %r8, %r19 -; EGPR-NDD-NEXT: movzbl %dil, %ecx -; EGPR-NDD-NEXT: adcq %rcx, %rax, %r31 -; EGPR-NDD-NEXT: adcq $0, %rdx, %r12 -; EGPR-NDD-NEXT: imulq %r15, %r26, %rcx +; EGPR-NDD-NEXT: addq %rcx, %r31, %r25 +; EGPR-NDD-NEXT: adcq %rsi, %r30, %r12 +; EGPR-NDD-NEXT: movzbl %dil, %r19d +; EGPR-NDD-NEXT: adcq %rax, %r19 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r31 +; EGPR-NDD-NEXT: imulq %r26, %rbx ; EGPR-NDD-NEXT: movq %r26, %rax -; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: mulq %r8 ; EGPR-NDD-NEXT: movq %rax, %r30 -; EGPR-NDD-NEXT: addq %rcx, %rdx, %rax -; EGPR-NDD-NEXT: imulq %rbx, %r18, %rcx -; EGPR-NDD-NEXT: addq %rax, %rcx -; EGPR-NDD-NEXT: imulq %r29, %r11, %rsi +; EGPR-NDD-NEXT: addq %rbx, %rdx +; EGPR-NDD-NEXT: imulq %r18, %r8 +; EGPR-NDD-NEXT: addq %rdx, %r8 +; EGPR-NDD-NEXT: imulq %r29, %r11, %rcx ; EGPR-NDD-NEXT: movq %r11, %rax ; EGPR-NDD-NEXT: mulq %r28 -; EGPR-NDD-NEXT: addq %rsi, %rdx -; EGPR-NDD-NEXT: imulq %r28, %r16, %rsi -; EGPR-NDD-NEXT: addq %rsi, %rdx +; EGPR-NDD-NEXT: addq %rdx, %rcx +; EGPR-NDD-NEXT: imulq %r28, %r17, %r16 +; EGPR-NDD-NEXT: addq %r16, %rcx ; EGPR-NDD-NEXT: addq %r30, %rax, %rsi -; EGPR-NDD-NEXT: adcq %rcx, %rdx, %rdi +; EGPR-NDD-NEXT: adcq %rcx, %r8 ; EGPR-NDD-NEXT: movq %r28, %rax ; EGPR-NDD-NEXT: 
mulq %r26 ; EGPR-NDD-NEXT: movq %rdx, %r30 @@ -1601,215 +1593,215 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: movq %r29, %rax ; EGPR-NDD-NEXT: mulq %r26 ; EGPR-NDD-NEXT: addq %r30, %rax, %rcx -; EGPR-NDD-NEXT: adcq $0, %rdx, %r8 +; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi ; EGPR-NDD-NEXT: movq %r28, %rax ; EGPR-NDD-NEXT: mulq %r18 ; EGPR-NDD-NEXT: addq %rax, %rcx -; EGPR-NDD-NEXT: adcq %rdx, %r8 +; EGPR-NDD-NEXT: adcq %rdx, %rdi ; EGPR-NDD-NEXT: setb %r9b ; EGPR-NDD-NEXT: movq %r29, %rax ; EGPR-NDD-NEXT: mulq %r18 -; EGPR-NDD-NEXT: addq %r8, %rax -; EGPR-NDD-NEXT: movzbl %r9b, %r8d -; EGPR-NDD-NEXT: adcq %r8, %rdx +; EGPR-NDD-NEXT: addq %rdi, %rax +; EGPR-NDD-NEXT: movzbl %r9b, %edi +; EGPR-NDD-NEXT: adcq %rdi, %rdx ; EGPR-NDD-NEXT: addq %rax, %rsi -; EGPR-NDD-NEXT: adcq %rdi, %rdx, %r29 +; EGPR-NDD-NEXT: adcq %rdx, %r8 ; EGPR-NDD-NEXT: movq 112(%r20), %rdi ; EGPR-NDD-NEXT: movq %r22, %rax ; EGPR-NDD-NEXT: mulq %rdi ; EGPR-NDD-NEXT: movq %rax, %r26 -; EGPR-NDD-NEXT: imulq %rdi, %r21, %rax -; EGPR-NDD-NEXT: addq %rdx, %rax -; EGPR-NDD-NEXT: imulq 120(%r20), %r22, %rdx -; EGPR-NDD-NEXT: addq %rdx, %rax, %r8 +; EGPR-NDD-NEXT: imulq %r21, %rdi +; EGPR-NDD-NEXT: addq %rdi, %rdx +; EGPR-NDD-NEXT: imulq 120(%r20), %r22, %rax +; EGPR-NDD-NEXT: addq %rax, %rdx, %r9 ; EGPR-NDD-NEXT: movq 96(%r20), %r28 ; EGPR-NDD-NEXT: movq 104(%r20), %rdi -; EGPR-NDD-NEXT: imulq %rdi, %r24, %r9 +; EGPR-NDD-NEXT: imulq %rdi, %r24, %r10 ; EGPR-NDD-NEXT: movq %r24, %rax ; EGPR-NDD-NEXT: mulq %r28 -; EGPR-NDD-NEXT: addq %r9, %rdx -; EGPR-NDD-NEXT: imulq %r28, %r23, %r9 -; EGPR-NDD-NEXT: addq %r9, %rdx -; EGPR-NDD-NEXT: addq %r26, %rax, %r9 -; EGPR-NDD-NEXT: adcq %rdx, %r8 +; EGPR-NDD-NEXT: addq %r10, %rdx +; EGPR-NDD-NEXT: imulq %r28, %r14, %r23 +; EGPR-NDD-NEXT: addq %r23, %rdx +; EGPR-NDD-NEXT: addq %rax, %r26 +; EGPR-NDD-NEXT: adcq %rdx, %r9 ; EGPR-NDD-NEXT: movq %r28, %rax ; EGPR-NDD-NEXT: mulq %r22 ; EGPR-NDD-NEXT: movq %rdx, %r23 ; EGPR-NDD-NEXT: movq %rax, 
%r24 ; EGPR-NDD-NEXT: movq %rdi, %rax ; EGPR-NDD-NEXT: mulq %r22 -; EGPR-NDD-NEXT: addq %r23, %rax, %r10 -; EGPR-NDD-NEXT: adcq $0, %rdx, %r11 +; EGPR-NDD-NEXT: addq %rax, %r23 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r10 ; EGPR-NDD-NEXT: movq %r28, %rax ; EGPR-NDD-NEXT: mulq %r21 -; EGPR-NDD-NEXT: addq %rax, %r10 -; EGPR-NDD-NEXT: adcq %rdx, %r11 -; EGPR-NDD-NEXT: setb %r16b +; EGPR-NDD-NEXT: addq %rax, %r23 +; EGPR-NDD-NEXT: adcq %rdx, %r10 +; EGPR-NDD-NEXT: setb %r11b ; EGPR-NDD-NEXT: movq %rdi, %rax ; EGPR-NDD-NEXT: mulq %r21 -; EGPR-NDD-NEXT: addq %r11, %rax -; EGPR-NDD-NEXT: movzbl %r16b, %edi +; EGPR-NDD-NEXT: addq %r10, %rax +; EGPR-NDD-NEXT: movzbl %r11b, %edi ; EGPR-NDD-NEXT: adcq %rdi, %rdx -; EGPR-NDD-NEXT: addq %r9, %rax -; EGPR-NDD-NEXT: adcq %r8, %rdx -; EGPR-NDD-NEXT: addq %r27, %r24, %rdi -; EGPR-NDD-NEXT: adcq %r10, %rcx +; EGPR-NDD-NEXT: addq %r26, %rax +; EGPR-NDD-NEXT: adcq %r9, %rdx +; EGPR-NDD-NEXT: addq %r27, %r24 +; EGPR-NDD-NEXT: adcq %r23, %rcx ; EGPR-NDD-NEXT: adcq %rsi, %rax -; EGPR-NDD-NEXT: adcq %r29, %rdx -; EGPR-NDD-NEXT: addq %rdi, %r25, %r15 -; EGPR-NDD-NEXT: adcq %rcx, %r19, %rbx -; EGPR-NDD-NEXT: adcq %rax, %r31, %rbp -; EGPR-NDD-NEXT: adcq %rdx, %r12, %r30 -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r18 # 8-byte Reload -; EGPR-NDD-NEXT: movq 80(%r18), %r22 +; EGPR-NDD-NEXT: adcq %r8, %rdx +; EGPR-NDD-NEXT: addq %r24, %r25, %rbx +; EGPR-NDD-NEXT: adcq %rcx, %r12 +; EGPR-NDD-NEXT: adcq %rax, %r19, %r13 +; EGPR-NDD-NEXT: adcq %rdx, %r31, %r30 +; EGPR-NDD-NEXT: movq 80(%r15), %r22 ; EGPR-NDD-NEXT: movq %r22, %rax -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Reload -; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload +; EGPR-NDD-NEXT: mulq %r16 ; EGPR-NDD-NEXT: movq %rax, %r26 ; EGPR-NDD-NEXT: movq %rdx, %rdi -; EGPR-NDD-NEXT: movq 88(%r18), %r20 +; EGPR-NDD-NEXT: movq 88(%r15), %r20 ; EGPR-NDD-NEXT: movq %r20, %rax -; EGPR-NDD-NEXT: mulq %r21 -; EGPR-NDD-NEXT: addq %rdi, 
%rax, %rcx -; EGPR-NDD-NEXT: adcq $0, %rdx, %rsi +; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: addq %rax, %rdi +; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx ; EGPR-NDD-NEXT: movq %r22, %rax -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; EGPR-NDD-NEXT: mulq %r12 -; EGPR-NDD-NEXT: addq %rax, %rcx -; EGPR-NDD-NEXT: adcq %rdx, %rsi -; EGPR-NDD-NEXT: setb %dil +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Reload +; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: addq %rax, %rdi +; EGPR-NDD-NEXT: adcq %rdx, %rcx +; EGPR-NDD-NEXT: setb %sil ; EGPR-NDD-NEXT: movq %r20, %rax -; EGPR-NDD-NEXT: mulq %r12 -; EGPR-NDD-NEXT: addq %rax, %rsi -; EGPR-NDD-NEXT: movzbl %dil, %eax -; EGPR-NDD-NEXT: adcq %rax, %rdx, %rdi -; EGPR-NDD-NEXT: movq 64(%r18), %r24 -; EGPR-NDD-NEXT: movq %r24, %rax ; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: addq %rax, %rcx +; EGPR-NDD-NEXT: movzbl %sil, %eax +; EGPR-NDD-NEXT: adcq %rax, %rdx, %rsi +; EGPR-NDD-NEXT: movq 64(%r15), %r24 +; EGPR-NDD-NEXT: movq %r24, %rax +; EGPR-NDD-NEXT: mulq %r16 ; EGPR-NDD-NEXT: movq %rax, %r29 ; EGPR-NDD-NEXT: movq %rdx, %r27 -; EGPR-NDD-NEXT: movq 72(%r18), %r23 +; EGPR-NDD-NEXT: movq 72(%r15), %r23 ; EGPR-NDD-NEXT: movq %r23, %rax -; EGPR-NDD-NEXT: mulq %r21 -; EGPR-NDD-NEXT: addq %r27, %rax, %r8 -; EGPR-NDD-NEXT: adcq $0, %rdx, %r9 +; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: addq %rax, %r27 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r8 ; EGPR-NDD-NEXT: movq %r24, %rax -; EGPR-NDD-NEXT: mulq %r12 -; EGPR-NDD-NEXT: addq %r8, %rax, %r31 -; EGPR-NDD-NEXT: adcq %rdx, %r9, %r8 +; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: addq %r27, %rax, %r31 +; EGPR-NDD-NEXT: adcq %rdx, %r8 ; EGPR-NDD-NEXT: setb %r9b ; EGPR-NDD-NEXT: movq %r23, %rax -; EGPR-NDD-NEXT: mulq %r12 +; EGPR-NDD-NEXT: mulq %r21 ; EGPR-NDD-NEXT: addq %r8, %rax ; EGPR-NDD-NEXT: movzbl %r9b, %r8d ; EGPR-NDD-NEXT: adcq %r8, %rdx -; EGPR-NDD-NEXT: addq %rax, %r26, %r8 -; EGPR-NDD-NEXT: adcq %rdx, %rcx +; EGPR-NDD-NEXT: addq %rax, %r26, %r28 +; 
EGPR-NDD-NEXT: adcq %rdx, %rdi +; EGPR-NDD-NEXT: adcq $0, %rcx ; EGPR-NDD-NEXT: adcq $0, %rsi -; EGPR-NDD-NEXT: adcq $0, %rdi ; EGPR-NDD-NEXT: movq %r24, %rax -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload -; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; EGPR-NDD-NEXT: mulq %r10 ; EGPR-NDD-NEXT: movq %rdx, %r26 ; EGPR-NDD-NEXT: movq %rax, %r27 ; EGPR-NDD-NEXT: movq %r23, %rax -; EGPR-NDD-NEXT: mulq %r16 -; EGPR-NDD-NEXT: addq %r26, %rax, %r9 -; EGPR-NDD-NEXT: adcq $0, %rdx, %r10 +; EGPR-NDD-NEXT: mulq %r10 +; EGPR-NDD-NEXT: addq %rax, %r26 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r8 ; EGPR-NDD-NEXT: movq %r24, %rax -; EGPR-NDD-NEXT: mulq %r17 -; EGPR-NDD-NEXT: addq %rax, %r9 -; EGPR-NDD-NEXT: adcq %rdx, %r10 -; EGPR-NDD-NEXT: setb %r11b +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; EGPR-NDD-NEXT: mulq %r11 +; EGPR-NDD-NEXT: addq %r26, %rax, %r25 +; EGPR-NDD-NEXT: adcq %rdx, %r8 +; EGPR-NDD-NEXT: setb %r9b ; EGPR-NDD-NEXT: movq %r23, %rax -; EGPR-NDD-NEXT: mulq %r17 -; EGPR-NDD-NEXT: addq %r10, %rax -; EGPR-NDD-NEXT: movzbl %r11b, %r10d -; EGPR-NDD-NEXT: adcq %r10, %rdx -; EGPR-NDD-NEXT: addq %r8, %r27, %r28 -; EGPR-NDD-NEXT: adcq %rcx, %r9, %r25 +; EGPR-NDD-NEXT: mulq %r11 +; EGPR-NDD-NEXT: addq %r8, %rax +; EGPR-NDD-NEXT: movzbl %r9b, %r8d +; EGPR-NDD-NEXT: adcq %r8, %rdx +; EGPR-NDD-NEXT: addq %r27, %r28 +; EGPR-NDD-NEXT: adcq %rdi, %r25 ; EGPR-NDD-NEXT: adcq $0, %rax -; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx -; EGPR-NDD-NEXT: addq %rax, %rsi -; EGPR-NDD-NEXT: adcq %rdi, %rcx +; EGPR-NDD-NEXT: adcq $0, %rdx +; EGPR-NDD-NEXT: addq %rax, %rcx +; EGPR-NDD-NEXT: adcq %rdx, %rsi ; EGPR-NDD-NEXT: setb %dil ; EGPR-NDD-NEXT: movq %r22, %rax -; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: mulq %r10 ; EGPR-NDD-NEXT: movq %rdx, %r26 ; EGPR-NDD-NEXT: movq %rax, %r27 ; EGPR-NDD-NEXT: movq %r20, %rax -; EGPR-NDD-NEXT: mulq %r16 -; EGPR-NDD-NEXT: addq %r26, %rax, %r8 -; EGPR-NDD-NEXT: adcq 
$0, %rdx, %r9 +; EGPR-NDD-NEXT: mulq %r10 +; EGPR-NDD-NEXT: addq %rax, %r26 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r8 ; EGPR-NDD-NEXT: movq %r22, %rax -; EGPR-NDD-NEXT: mulq %r17 -; EGPR-NDD-NEXT: addq %rax, %r8 -; EGPR-NDD-NEXT: adcq %rdx, %r9 -; EGPR-NDD-NEXT: setb %r10b +; EGPR-NDD-NEXT: mulq %r11 +; EGPR-NDD-NEXT: addq %r26, %rax, %r19 +; EGPR-NDD-NEXT: adcq %rdx, %r8 +; EGPR-NDD-NEXT: setb %r9b ; EGPR-NDD-NEXT: movq %r20, %rax -; EGPR-NDD-NEXT: mulq %r17 -; EGPR-NDD-NEXT: addq %r9, %rax -; EGPR-NDD-NEXT: movzbl %r10b, %r9d -; EGPR-NDD-NEXT: adcq %r9, %rdx -; EGPR-NDD-NEXT: addq %rsi, %r27 -; EGPR-NDD-NEXT: adcq %rcx, %r8, %r19 +; EGPR-NDD-NEXT: mulq %r11 +; EGPR-NDD-NEXT: addq %r8, %rax +; EGPR-NDD-NEXT: movzbl %r9b, %r8d +; EGPR-NDD-NEXT: adcq %r8, %rdx +; EGPR-NDD-NEXT: addq %rcx, %r27 +; EGPR-NDD-NEXT: adcq %rsi, %r19 ; EGPR-NDD-NEXT: movzbl %dil, %ecx ; EGPR-NDD-NEXT: adcq %rax, %rcx ; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi -; EGPR-NDD-NEXT: movq %r18, %r9 -; EGPR-NDD-NEXT: movq 96(%r18), %r26 -; EGPR-NDD-NEXT: imulq %r17, %r26, %rsi +; EGPR-NDD-NEXT: movq 96(%r15), %r26 +; EGPR-NDD-NEXT: imulq %r11, %r26, %rsi ; EGPR-NDD-NEXT: movq %r26, %rax -; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: mulq %r10 ; EGPR-NDD-NEXT: movq %rax, %r18 -; EGPR-NDD-NEXT: addq %rsi, %rdx, %rax -; EGPR-NDD-NEXT: movq 104(%r9), %r8 -; EGPR-NDD-NEXT: imulq %r16, %r8, %rdx -; EGPR-NDD-NEXT: addq %rdx, %rax, %rsi -; EGPR-NDD-NEXT: movq 112(%r9), %rax -; EGPR-NDD-NEXT: movq %r9, %r11 -; EGPR-NDD-NEXT: imulq %r12, %rax, %r9 -; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: addq %rsi, %rdx +; EGPR-NDD-NEXT: movq 104(%r15), %r8 +; EGPR-NDD-NEXT: imulq %r10, %r8, %rax +; EGPR-NDD-NEXT: addq %rax, %rdx, %rsi +; EGPR-NDD-NEXT: movq 112(%r15), %rax +; EGPR-NDD-NEXT: imulq %r21, %rax, %r9 +; EGPR-NDD-NEXT: mulq %r16 ; EGPR-NDD-NEXT: addq %r9, %rdx -; EGPR-NDD-NEXT: imulq 120(%r11), %r21, %r9 +; EGPR-NDD-NEXT: imulq 120(%r15), %r16, %r9 ; EGPR-NDD-NEXT: addq %r9, %rdx -; EGPR-NDD-NEXT: addq %r18, %rax, %r9 
-; EGPR-NDD-NEXT: adcq %rsi, %rdx, %r16 -; EGPR-NDD-NEXT: movq %r21, %rax +; EGPR-NDD-NEXT: addq %r18, %rax, %r10 +; EGPR-NDD-NEXT: adcq %rsi, %rdx, %r9 +; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: movq %r16, %r18 ; EGPR-NDD-NEXT: mulq %r26 ; EGPR-NDD-NEXT: movq %rdx, %r17 ; EGPR-NDD-NEXT: movq %rax, %rsi -; EGPR-NDD-NEXT: movq %r12, %rax -; EGPR-NDD-NEXT: mulq %r26 -; EGPR-NDD-NEXT: addq %r17, %rax, %r10 -; EGPR-NDD-NEXT: adcq $0, %rdx, %r17 ; EGPR-NDD-NEXT: movq %r21, %rax +; EGPR-NDD-NEXT: mulq %r26 +; EGPR-NDD-NEXT: addq %r17, %rax, %r11 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r16 +; EGPR-NDD-NEXT: movq %r18, %rax ; EGPR-NDD-NEXT: mulq %r8 -; EGPR-NDD-NEXT: addq %r10, %rax, %r11 -; EGPR-NDD-NEXT: adcq %rdx, %r17, %r10 +; EGPR-NDD-NEXT: addq %rax, %r11 +; EGPR-NDD-NEXT: adcq %rdx, %r16 ; EGPR-NDD-NEXT: setb %r17b -; EGPR-NDD-NEXT: movq %r12, %rax +; EGPR-NDD-NEXT: movq %r21, %rax ; EGPR-NDD-NEXT: mulq %r8 -; EGPR-NDD-NEXT: addq %r10, %rax +; EGPR-NDD-NEXT: addq %r16, %rax ; EGPR-NDD-NEXT: movzbl %r17b, %r8d ; EGPR-NDD-NEXT: adcq %r8, %rdx -; EGPR-NDD-NEXT: addq %r9, %rax, %r10 -; EGPR-NDD-NEXT: adcq %r16, %rdx, %r17 -; EGPR-NDD-NEXT: imulq %r14, %r24, %r8 +; EGPR-NDD-NEXT: addq %rax, %r10 +; EGPR-NDD-NEXT: adcq %r9, %rdx, %r17 +; EGPR-NDD-NEXT: imulq {{[-0-9]+}}(%r{{[sb]}}p), %r24, %r8 # 8-byte Folded Reload ; EGPR-NDD-NEXT: movq %r24, %rax -; EGPR-NDD-NEXT: mulq %r13 +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload +; EGPR-NDD-NEXT: mulq %r16 ; EGPR-NDD-NEXT: movq %rax, %r9 -; EGPR-NDD-NEXT: addq %r8, %rdx, %rax -; EGPR-NDD-NEXT: imulq %r13, %r23, %rdx -; EGPR-NDD-NEXT: addq %rdx, %rax, %r8 +; EGPR-NDD-NEXT: addq %r8, %rdx +; EGPR-NDD-NEXT: imulq %r16, %r23, %rax +; EGPR-NDD-NEXT: addq %rax, %rdx, %r8 ; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Reload ; EGPR-NDD-NEXT: imulq %r21, %r22, %r16 ; EGPR-NDD-NEXT: movq %r22, %rax ; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r26 # 8-byte Reload ; EGPR-NDD-NEXT: mulq %r26 ; 
EGPR-NDD-NEXT: addq %r16, %rdx -; EGPR-NDD-NEXT: imulq %r26, %r20, %r16 -; EGPR-NDD-NEXT: addq %r16, %rdx +; EGPR-NDD-NEXT: imulq %r26, %r20 +; EGPR-NDD-NEXT: addq %r20, %rdx ; EGPR-NDD-NEXT: addq %r9, %rax, %r16 ; EGPR-NDD-NEXT: adcq %r8, %rdx, %r18 ; EGPR-NDD-NEXT: movq %r26, %rax @@ -1840,49 +1832,49 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: addq %r27, %rsi ; EGPR-NDD-NEXT: adcq %r19, %r8 ; EGPR-NDD-NEXT: adcq %rcx, %rax -; EGPR-NDD-NEXT: adcq %rdx, %rdi, %rcx -; EGPR-NDD-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r29, %rdx # 8-byte Folded Reload -; EGPR-NDD-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r31, %rdi # 8-byte Folded Reload -; EGPR-NDD-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r28, %r9 # 8-byte Folded Reload -; EGPR-NDD-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r25, %r10 # 8-byte Folded Reload -; EGPR-NDD-NEXT: adcq %r15, %rsi -; EGPR-NDD-NEXT: adcq %rbx, %r8 -; EGPR-NDD-NEXT: adcq %rbp, %rax -; EGPR-NDD-NEXT: adcq %r30, %rcx -; EGPR-NDD-NEXT: addq %rdx, {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; EGPR-NDD-NEXT: adcq %rdi, {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload -; EGPR-NDD-NEXT: adcq %r9, {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; EGPR-NDD-NEXT: adcq %r10, {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; EGPR-NDD-NEXT: adcq %rsi, {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; EGPR-NDD-NEXT: adcq %r8, (%rsp), %r8 # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %rdi, %rdx +; EGPR-NDD-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r29 # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r31 # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r28 # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %rbp, %r25 +; EGPR-NDD-NEXT: adcq %rbx, %rsi +; EGPR-NDD-NEXT: adcq %r12, %r8 +; EGPR-NDD-NEXT: adcq %r13, %rax +; EGPR-NDD-NEXT: adcq %r30, %rdx +; EGPR-NDD-NEXT: addq %r29, {{[-0-9]+}}(%r{{[sb]}}p), %r29 # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %r31, 
{{[-0-9]+}}(%r{{[sb]}}p), %r31 # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %r28, {{[-0-9]+}}(%r{{[sb]}}p), %r28 # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %r25, {{[-0-9]+}}(%r{{[sb]}}p), %r25 # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %rsi, (%rsp), %rsi # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload ; EGPR-NDD-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; EGPR-NDD-NEXT: adcq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload -; EGPR-NDD-NEXT: movq %r16, (%r11) -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload -; EGPR-NDD-NEXT: movq %r16, 8(%r11) -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload -; EGPR-NDD-NEXT: movq %r16, 16(%r11) -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload -; EGPR-NDD-NEXT: movq %r16, 24(%r11) -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload -; EGPR-NDD-NEXT: movq %r16, 32(%r11) -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload -; EGPR-NDD-NEXT: movq %r16, 40(%r11) -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload -; EGPR-NDD-NEXT: movq %r16, 48(%r11) -; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload -; EGPR-NDD-NEXT: movq %r16, 56(%r11) -; EGPR-NDD-NEXT: movq %rdx, 64(%r11) -; EGPR-NDD-NEXT: movq %rdi, 72(%r11) -; EGPR-NDD-NEXT: movq %r9, 80(%r11) -; EGPR-NDD-NEXT: movq %r10, 88(%r11) -; EGPR-NDD-NEXT: movq %rsi, 96(%r11) -; EGPR-NDD-NEXT: movq %r8, 104(%r11) -; EGPR-NDD-NEXT: movq %rax, 112(%r11) -; EGPR-NDD-NEXT: movq %rcx, 120(%r11) -; EGPR-NDD-NEXT: addq $104, %rsp +; EGPR-NDD-NEXT: adcq %rdx, {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; EGPR-NDD-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; EGPR-NDD-NEXT: movq %rdi, (%rcx) +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; EGPR-NDD-NEXT: movq %rdi, 8(%rcx) +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; EGPR-NDD-NEXT: movq %rdi, 16(%rcx) +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; EGPR-NDD-NEXT: movq %rdi, 24(%rcx) +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; EGPR-NDD-NEXT: movq %rdi, 32(%rcx) +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; EGPR-NDD-NEXT: movq %rdi, 40(%rcx) +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; EGPR-NDD-NEXT: movq %rdi, 48(%rcx) +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; EGPR-NDD-NEXT: movq %rdi, 56(%rcx) +; EGPR-NDD-NEXT: movq %r29, 64(%rcx) +; EGPR-NDD-NEXT: movq %r31, 72(%rcx) +; EGPR-NDD-NEXT: movq %r28, 80(%rcx) +; EGPR-NDD-NEXT: movq %r25, 88(%rcx) +; EGPR-NDD-NEXT: movq %rsi, 96(%rcx) +; EGPR-NDD-NEXT: movq %r8, 104(%rcx) +; EGPR-NDD-NEXT: movq %rax, 112(%rcx) +; EGPR-NDD-NEXT: movq %rdx, 120(%rcx) +; EGPR-NDD-NEXT: addq $96, %rsp ; EGPR-NDD-NEXT: popq %rbx ; EGPR-NDD-NEXT: popq %r12 ; EGPR-NDD-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/apx/or.ll b/llvm/test/CodeGen/X86/apx/or.ll index 6a3db295c8c1..e51ba9d9bf03 100644 --- a/llvm/test/CodeGen/X86/apx/or.ll +++ b/llvm/test/CodeGen/X86/apx/or.ll @@ -478,17 +478,17 @@ define i1 @orflag16rr(i16 %a, i16 %b) { define i1 @orflag32rr(i32 %a, i32 %b) { ; CHECK-LABEL: orflag32rr: ; CHECK: # %bb.0: -; CHECK-NEXT: orl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x09,0xf7] +; CHECK-NEXT: orl %edi, %esi # EVEX TO LEGACY Compression encoding: [0x09,0xfe] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, 
kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: orflag32rr: ; NF: # %bb.0: -; NF-NEXT: orl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x09,0xf7] +; NF-NEXT: orl %edi, %esi # EVEX TO LEGACY Compression encoding: [0x09,0xfe] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A] ; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %v0 = or i32 %a, %b ; 0xff << 50 @@ -500,17 +500,17 @@ define i1 @orflag32rr(i32 %a, i32 %b) { define i1 @orflag64rr(i64 %a, i64 %b) { ; CHECK-LABEL: orflag64rr: ; CHECK: # %bb.0: -; CHECK-NEXT: orq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x09,0xf7] +; CHECK-NEXT: orq %rdi, %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x09,0xfe] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: orflag64rr: ; NF: # %bb.0: -; NF-NEXT: orq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x09,0xf7] +; NF-NEXT: orq %rdi, %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x09,0xfe] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A] ; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %v0 = or i64 %a, %b ; 0xff << 50 @@ -574,17 +574,17 @@ define i1 @orflag16rm(ptr %ptr, i16 %b) { define i1 @orflag32rm(ptr %ptr, i32 %b) { ; CHECK-LABEL: orflag32rm: ; CHECK: # %bb.0: -; CHECK-NEXT: orl (%rdi), %esi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x0b,0x37] +; CHECK-NEXT: orl (%rdi), %esi 
# EVEX TO LEGACY Compression encoding: [0x0b,0x37] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: orflag32rm: ; NF: # %bb.0: -; NF-NEXT: orl (%rdi), %esi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x0b,0x37] +; NF-NEXT: orl (%rdi), %esi # EVEX TO LEGACY Compression encoding: [0x0b,0x37] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A] ; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %a = load i32, ptr %ptr @@ -597,17 +597,17 @@ define i1 @orflag32rm(ptr %ptr, i32 %b) { define i1 @orflag64rm(ptr %ptr, i64 %b) { ; CHECK-LABEL: orflag64rm: ; CHECK: # %bb.0: -; CHECK-NEXT: orq (%rdi), %rsi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x0b,0x37] +; CHECK-NEXT: orq (%rdi), %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x0b,0x37] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: orflag64rm: ; NF: # %bb.0: -; NF-NEXT: orq (%rdi), %rsi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x0b,0x37] +; NF-NEXT: orq (%rdi), %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x0b,0x37] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A] ; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] 
%a = load i64, ptr %ptr @@ -668,19 +668,19 @@ define i1 @orflag16ri(i16 %a) { define i1 @orflag32ri(i32 %a) { ; CHECK-LABEL: orflag32ri: ; CHECK: # %bb.0: -; CHECK-NEXT: orl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xcf,0x40,0xe2,0x01,0x00] +; CHECK-NEXT: orl $123456, %edi # EVEX TO LEGACY Compression encoding: [0x81,0xcf,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movl %edi, d64(%rip) # encoding: [0x89,0x3d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: orflag32ri: ; NF: # %bb.0: -; NF-NEXT: orl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xcf,0x40,0xe2,0x01,0x00] +; NF-NEXT: orl $123456, %edi # EVEX TO LEGACY Compression encoding: [0x81,0xcf,0x40,0xe2,0x01,0x00] ; NF-NEXT: # imm = 0x1E240 ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: movl %edi, d64(%rip) # encoding: [0x89,0x3d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %v0 = or i32 %a, 123456 ; 0xff << 50 @@ -692,19 +692,19 @@ define i1 @orflag32ri(i32 %a) { define i1 @orflag64ri(i64 %a) { ; CHECK-LABEL: orflag64ri: ; CHECK: # %bb.0: -; CHECK-NEXT: orq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xcf,0x40,0xe2,0x01,0x00] +; CHECK-NEXT: orq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xcf,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: orflag64ri: ; 
NF: # %bb.0: -; NF-NEXT: orq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xcf,0x40,0xe2,0x01,0x00] +; NF-NEXT: orq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xcf,0x40,0xe2,0x01,0x00] ; NF-NEXT: # imm = 0x1E240 ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %v0 = or i64 %a, 123456 ; 0xff << 50 @@ -739,17 +739,17 @@ define i1 @orflag16ri8(i16 %a) { define i1 @orflag32ri8(i32 %a) { ; CHECK-LABEL: orflag32ri8: ; CHECK: # %bb.0: -; CHECK-NEXT: orl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xcf,0x7b] +; CHECK-NEXT: orl $123, %edi # EVEX TO LEGACY Compression encoding: [0x83,0xcf,0x7b] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movl %edi, d64(%rip) # encoding: [0x89,0x3d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: orflag32ri8: ; NF: # %bb.0: -; NF-NEXT: orl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xcf,0x7b] +; NF-NEXT: orl $123, %edi # EVEX TO LEGACY Compression encoding: [0x83,0xcf,0x7b] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: movl %edi, d64(%rip) # encoding: [0x89,0x3d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %v0 = or i32 %a, 123 ; 0xff << 50 @@ -761,17 +761,17 @@ define i1 @orflag32ri8(i32 %a) { define i1 @orflag64ri8(i64 %a) { ; CHECK-LABEL: orflag64ri8: ; CHECK: # %bb.0: -; CHECK-NEXT: orq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xcf,0x7b] +; CHECK-NEXT: orq $123, %rdi # EVEX TO LEGACY Compression 
encoding: [0x48,0x83,0xcf,0x7b] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: orflag64ri8: ; NF: # %bb.0: -; NF-NEXT: orq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xcf,0x7b] +; NF-NEXT: orq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xcf,0x7b] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %v0 = or i64 %a, 123 ; 0xff << 50 diff --git a/llvm/test/CodeGen/X86/apx/push2-pop2-vector-register.ll b/llvm/test/CodeGen/X86/apx/push2-pop2-vector-register.ll index aa5c54d30e3b..f20c4c1ae278 100644 --- a/llvm/test/CodeGen/X86/apx/push2-pop2-vector-register.ll +++ b/llvm/test/CodeGen/X86/apx/push2-pop2-vector-register.ll @@ -43,8 +43,12 @@ define void @widget(float %arg) nounwind { ; FRAME-NEXT: xorl %r8d, %r8d ; FRAME-NEXT: callq *%rsi ; FRAME-NEXT: movss %xmm6, 0 +; FRAME-NEXT: pushq %rbp +; FRAME-NEXT: pushq %rax ; FRAME-NEXT: #APP ; FRAME-NEXT: #NO_APP +; FRAME-NEXT: popq %rax +; FRAME-NEXT: popq %rbp ; FRAME-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; FRAME-NEXT: addq $48, %rsp ; FRAME-NEXT: pop2 %r15, %rsi diff --git a/llvm/test/CodeGen/X86/apx/push2-pop2.ll b/llvm/test/CodeGen/X86/apx/push2-pop2.ll index 25139f1da827..6bd9f525090e 100644 --- a/llvm/test/CodeGen/X86/apx/push2-pop2.ll +++ b/llvm/test/CodeGen/X86/apx/push2-pop2.ll @@ -24,8 +24,12 @@ define void @csr1() nounwind { ; FRAME: # %bb.0: # %entry ; FRAME-NEXT: pushq %rbp ; FRAME-NEXT: movq %rsp, %rbp +; FRAME-NEXT: pushq %rbp +; FRAME-NEXT: 
pushq %rax ; FRAME-NEXT: #APP ; FRAME-NEXT: #NO_APP +; FRAME-NEXT: popq %rax +; FRAME-NEXT: popq %rbp ; FRAME-NEXT: popq %rbp ; FRAME-NEXT: retq entry: @@ -59,8 +63,12 @@ define void @csr2() nounwind { ; FRAME-NEXT: pushq %rbp ; FRAME-NEXT: movq %rsp, %rbp ; FRAME-NEXT: pushq %r15 +; FRAME-NEXT: pushq %rbp +; FRAME-NEXT: pushq %rax ; FRAME-NEXT: #APP ; FRAME-NEXT: #NO_APP +; FRAME-NEXT: popq %rax +; FRAME-NEXT: popq %rbp ; FRAME-NEXT: popq %r15 ; FRAME-NEXT: popq %rbp ; FRAME-NEXT: retq @@ -95,8 +103,12 @@ define void @csr3() nounwind { ; FRAME-NEXT: pushq %rbp ; FRAME-NEXT: movq %rsp, %rbp ; FRAME-NEXT: push2 %r14, %r15 +; FRAME-NEXT: pushq %rbp +; FRAME-NEXT: pushq %rax ; FRAME-NEXT: #APP ; FRAME-NEXT: #NO_APP +; FRAME-NEXT: popq %rax +; FRAME-NEXT: popq %rbp ; FRAME-NEXT: pop2 %r15, %r14 ; FRAME-NEXT: popq %rbp ; FRAME-NEXT: retq @@ -136,8 +148,12 @@ define void @csr4() nounwind { ; FRAME-NEXT: movq %rsp, %rbp ; FRAME-NEXT: push2 %r14, %r15 ; FRAME-NEXT: pushq %r13 +; FRAME-NEXT: pushq %rbp +; FRAME-NEXT: pushq %rax ; FRAME-NEXT: #APP ; FRAME-NEXT: #NO_APP +; FRAME-NEXT: popq %rax +; FRAME-NEXT: popq %rbp ; FRAME-NEXT: popq %r13 ; FRAME-NEXT: pop2 %r15, %r14 ; FRAME-NEXT: popq %rbp @@ -178,8 +194,12 @@ define void @csr5() nounwind { ; FRAME-NEXT: movq %rsp, %rbp ; FRAME-NEXT: push2 %r14, %r15 ; FRAME-NEXT: push2 %r12, %r13 +; FRAME-NEXT: pushq %rbp +; FRAME-NEXT: pushq %rax ; FRAME-NEXT: #APP ; FRAME-NEXT: #NO_APP +; FRAME-NEXT: popq %rax +; FRAME-NEXT: popq %rbp ; FRAME-NEXT: pop2 %r13, %r12 ; FRAME-NEXT: pop2 %r15, %r14 ; FRAME-NEXT: popq %rbp @@ -225,8 +245,12 @@ define void @csr6() nounwind { ; FRAME-NEXT: push2 %r14, %r15 ; FRAME-NEXT: push2 %r12, %r13 ; FRAME-NEXT: pushq %rbx +; FRAME-NEXT: pushq %rbp +; FRAME-NEXT: pushq %rax ; FRAME-NEXT: #APP ; FRAME-NEXT: #NO_APP +; FRAME-NEXT: popq %rax +; FRAME-NEXT: popq %rbp ; FRAME-NEXT: popq %rbx ; FRAME-NEXT: pop2 %r13, %r12 ; FRAME-NEXT: pop2 %r15, %r14 diff --git a/llvm/test/CodeGen/X86/apx/pushp-popp.ll 
b/llvm/test/CodeGen/X86/apx/pushp-popp.ll index ad4306fccce6..625e70b07198 100644 --- a/llvm/test/CodeGen/X86/apx/pushp-popp.ll +++ b/llvm/test/CodeGen/X86/apx/pushp-popp.ll @@ -18,8 +18,12 @@ define void @csr2() nounwind { ; FRAME-NEXT: pushp %rbp ; FRAME-NEXT: movq %rsp, %rbp ; FRAME-NEXT: pushp %r15 +; FRAME-NEXT: pushp %rbp +; FRAME-NEXT: pushq %rax ; FRAME-NEXT: #APP ; FRAME-NEXT: #NO_APP +; FRAME-NEXT: popq %rax +; FRAME-NEXT: popp %rbp ; FRAME-NEXT: popp %r15 ; FRAME-NEXT: popp %rbp ; FRAME-NEXT: retq diff --git a/llvm/test/CodeGen/X86/apx/shift-eflags.ll b/llvm/test/CodeGen/X86/apx/shift-eflags.ll index 5da5090307e6..2659f8031ef7 100644 --- a/llvm/test/CodeGen/X86/apx/shift-eflags.ll +++ b/llvm/test/CodeGen/X86/apx/shift-eflags.ll @@ -7,7 +7,7 @@ define i32 @ashr_const(i32 %a0, i32 %a1, i32 %a2, i32 %a3) { ; CHECK-LABEL: ashr_const: ; CHECK: # %bb.0: -; CHECK-NEXT: sarl $14, %edi, %eax +; CHECK-NEXT: sarl $14, %edi ; CHECK-NEXT: cmovel %edx, %ecx, %eax ; CHECK-NEXT: retq %s = ashr i32 %a0, 14 @@ -85,7 +85,7 @@ define i32 @shl_const_self_select(i32 %a0, i32 %a1, i32 %a2, i32 %a3) { define i32 @ashr_const1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) { ; CHECK-LABEL: ashr_const1: ; CHECK: # %bb.0: -; CHECK-NEXT: sarl %edi, %eax +; CHECK-NEXT: sarl %edi ; CHECK-NEXT: cmovel %edx, %ecx, %eax ; CHECK-NEXT: retq %s = ashr i32 %a0, 1 @@ -166,8 +166,8 @@ define i32 @ashr_var(i32 %a0, i32 %a1, i32 %a2, i32 %a3) { ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movl %esi, %ecx ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NEXT: sarl %cl, %edi, %ecx -; CHECK-NEXT: testl %ecx, %ecx +; CHECK-NEXT: sarl %cl, %edi +; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: cmovel %edx, %eax ; CHECK-NEXT: retq %s = ashr i32 %a0, %a1 @@ -183,8 +183,8 @@ define i32 @lshr_var(i32 %a0, i32 %a1, i32 %a2, i32 %a3) { ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movl %esi, %ecx ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NEXT: shrl %cl, %edi, %ecx -; CHECK-NEXT: testl %ecx, 
%ecx +; CHECK-NEXT: shrl %cl, %edi +; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: cmovel %edx, %eax ; CHECK-NEXT: retq %s = lshr i32 %a0, %a1 @@ -200,8 +200,8 @@ define i32 @shl_var(i32 %a0, i32 %a1, i32 %a2, i32 %a3) { ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movl %esi, %ecx ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NEXT: shll %cl, %edi, %ecx -; CHECK-NEXT: testl %ecx, %ecx +; CHECK-NEXT: shll %cl, %edi +; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: cmovel %edx, %eax ; CHECK-NEXT: retq %s = shl i32 %a0, %a1 @@ -264,8 +264,8 @@ define i32 @ashr_var_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a3) { ; CHECK: # %bb.0: ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: orb $1, %sil, %cl -; CHECK-NEXT: sarl %cl, %edi, %ecx -; CHECK-NEXT: testl %ecx, %ecx +; CHECK-NEXT: sarl %cl, %edi +; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: cmovel %edx, %eax ; CHECK-NEXT: retq %a = or i32 %a1, 1 @@ -281,8 +281,8 @@ define i32 @lshr_var_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a3) { ; CHECK: # %bb.0: ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: orb $1, %sil, %cl -; CHECK-NEXT: shrl %cl, %edi, %ecx -; CHECK-NEXT: testl %ecx, %ecx +; CHECK-NEXT: shrl %cl, %edi +; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: cmovel %edx, %eax ; CHECK-NEXT: retq %a = or i32 %a1, 1 @@ -298,8 +298,8 @@ define i32 @shl_var_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a3) { ; CHECK: # %bb.0: ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: orb $1, %sil, %cl -; CHECK-NEXT: shll %cl, %edi, %ecx -; CHECK-NEXT: testl %ecx, %ecx +; CHECK-NEXT: shll %cl, %edi +; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: cmovel %edx, %eax ; CHECK-NEXT: retq %a = or i32 %a1, 1 diff --git a/llvm/test/CodeGen/X86/apx/sub.ll b/llvm/test/CodeGen/X86/apx/sub.ll index 75d705557cdf..9519fab4ee51 100644 --- a/llvm/test/CodeGen/X86/apx/sub.ll +++ b/llvm/test/CodeGen/X86/apx/sub.ll @@ -451,16 +451,16 @@ define i16 @subflag16rr(i16 noundef %a, i16 noundef %b) { ; CHECK-LABEL: subflag16rr: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; CHECK-NEXT: subw %si, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x29,0xf7] -; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; CHECK-NEXT: subw %si, %di # EVEX TO LEGACY Compression encoding: [0x66,0x29,0xf7] +; CHECK-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: subflag16rr: ; NF: # %bb.0: # %entry ; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NF-NEXT: subw %si, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x29,0xf7] -; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: subw %si, %di # EVEX TO LEGACY Compression encoding: [0x66,0x29,0xf7] +; NF-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7] ; NF-NEXT: # kill: def $ax killed $ax killed $eax ; NF-NEXT: retq # encoding: [0xc3] entry: @@ -472,15 +472,15 @@ define i32 @subflag32rr(i32 noundef %a, i32 noundef %b) { ; CHECK-LABEL: subflag32rr: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; CHECK-NEXT: subl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x29,0xf7] -; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; CHECK-NEXT: subl %esi, %edi # EVEX TO LEGACY Compression encoding: [0x29,0xf7] +; CHECK-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: subflag32rr: ; NF: # %bb.0: # %entry ; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NF-NEXT: subl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x29,0xf7] -; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: subl %esi, %edi # EVEX TO LEGACY Compression encoding: [0x29,0xf7] +; NF-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: 
[0x0f,0x43,0xc7] ; NF-NEXT: retq # encoding: [0xc3] entry: %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 %b) @@ -491,15 +491,15 @@ define i64 @subflag64rr(i64 noundef %a, i64 noundef %b) { ; CHECK-LABEL: subflag64rr: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; CHECK-NEXT: subq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x29,0xf7] -; CHECK-NEXT: cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1] +; CHECK-NEXT: subq %rsi, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x29,0xf7] +; CHECK-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: subflag64rr: ; NF: # %bb.0: # %entry ; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NF-NEXT: subq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x29,0xf7] -; NF-NEXT: cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1] +; NF-NEXT: subq %rsi, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x29,0xf7] +; NF-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7] ; NF-NEXT: retq # encoding: [0xc3] entry: %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 %b) @@ -534,16 +534,16 @@ define i16 @subflag16rm(i16 noundef %a, ptr %b) { ; CHECK-LABEL: subflag16rm: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; CHECK-NEXT: subw (%rsi), %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x2b,0x3e] -; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; CHECK-NEXT: subw (%rsi), %di # EVEX TO LEGACY Compression encoding: [0x66,0x2b,0x3e] +; CHECK-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: subflag16rm: ; NF: # %bb.0: # %entry ; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NF-NEXT: subw (%rsi), %di, %cx # 
encoding: [0x62,0xf4,0x75,0x18,0x2b,0x3e] -; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: subw (%rsi), %di # EVEX TO LEGACY Compression encoding: [0x66,0x2b,0x3e] +; NF-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7] ; NF-NEXT: # kill: def $ax killed $ax killed $eax ; NF-NEXT: retq # encoding: [0xc3] entry: @@ -556,15 +556,15 @@ define i32 @subflag32rm(i32 noundef %a, ptr %b) { ; CHECK-LABEL: subflag32rm: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; CHECK-NEXT: subl (%rsi), %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x2b,0x3e] -; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; CHECK-NEXT: subl (%rsi), %edi # EVEX TO LEGACY Compression encoding: [0x2b,0x3e] +; CHECK-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: subflag32rm: ; NF: # %bb.0: # %entry ; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NF-NEXT: subl (%rsi), %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x2b,0x3e] -; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: subl (%rsi), %edi # EVEX TO LEGACY Compression encoding: [0x2b,0x3e] +; NF-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7] ; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i32, ptr %b @@ -576,15 +576,15 @@ define i64 @subflag64rm(i64 noundef %a, ptr %b) { ; CHECK-LABEL: subflag64rm: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; CHECK-NEXT: subq (%rsi), %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x2b,0x3e] -; CHECK-NEXT: cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1] +; CHECK-NEXT: subq (%rsi), %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x2b,0x3e] +; CHECK-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: 
[0x48,0x0f,0x43,0xc7] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: subflag64rm: ; NF: # %bb.0: # %entry ; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NF-NEXT: subq (%rsi), %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x2b,0x3e] -; NF-NEXT: cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1] +; NF-NEXT: subq (%rsi), %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x2b,0x3e] +; NF-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7] ; NF-NEXT: retq # encoding: [0xc3] entry: %t = load i64, ptr %b @@ -596,16 +596,16 @@ define i16 @subflag16ri8(i16 noundef %a) { ; CHECK-LABEL: subflag16ri8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; CHECK-NEXT: subw $123, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xef,0x7b] -; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; CHECK-NEXT: subw $123, %di # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xef,0x7b] +; CHECK-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: subflag16ri8: ; NF: # %bb.0: # %entry ; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NF-NEXT: subw $123, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xef,0x7b] -; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: subw $123, %di # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xef,0x7b] +; NF-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7] ; NF-NEXT: # kill: def $ax killed $ax killed $eax ; NF-NEXT: retq # encoding: [0xc3] entry: @@ -617,15 +617,15 @@ define i32 @subflag32ri8(i32 noundef %a) { ; CHECK-LABEL: subflag32ri8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; CHECK-NEXT: subl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xef,0x7b] -; 
CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; CHECK-NEXT: subl $123, %edi # EVEX TO LEGACY Compression encoding: [0x83,0xef,0x7b] +; CHECK-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: subflag32ri8: ; NF: # %bb.0: # %entry ; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NF-NEXT: subl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xef,0x7b] -; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: subl $123, %edi # EVEX TO LEGACY Compression encoding: [0x83,0xef,0x7b] +; NF-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7] ; NF-NEXT: retq # encoding: [0xc3] entry: %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 123) @@ -636,15 +636,15 @@ define i64 @subflag64ri8(i64 noundef %a) { ; CHECK-LABEL: subflag64ri8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; CHECK-NEXT: subq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xef,0x7b] -; CHECK-NEXT: cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1] +; CHECK-NEXT: subq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xef,0x7b] +; CHECK-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: subflag64ri8: ; NF: # %bb.0: # %entry ; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NF-NEXT: subq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xef,0x7b] -; NF-NEXT: cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1] +; NF-NEXT: subq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xef,0x7b] +; NF-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7] ; NF-NEXT: retq # encoding: [0xc3] entry: %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 123) @@ -678,18 +678,18 @@ 
define i16 @subflag16ri(i16 noundef %a) { ; CHECK-LABEL: subflag16ri: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; CHECK-NEXT: subw $1234, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xef,0xd2,0x04] +; CHECK-NEXT: subw $1234, %di # EVEX TO LEGACY Compression encoding: [0x66,0x81,0xef,0xd2,0x04] ; CHECK-NEXT: # imm = 0x4D2 -; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; CHECK-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: subflag16ri: ; NF: # %bb.0: # %entry ; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NF-NEXT: subw $1234, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xef,0xd2,0x04] +; NF-NEXT: subw $1234, %di # EVEX TO LEGACY Compression encoding: [0x66,0x81,0xef,0xd2,0x04] ; NF-NEXT: # imm = 0x4D2 -; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7] ; NF-NEXT: # kill: def $ax killed $ax killed $eax ; NF-NEXT: retq # encoding: [0xc3] entry: @@ -701,17 +701,17 @@ define i32 @subflag32ri(i32 noundef %a) { ; CHECK-LABEL: subflag32ri: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; CHECK-NEXT: subl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xef,0x40,0xe2,0x01,0x00] +; CHECK-NEXT: subl $123456, %edi # EVEX TO LEGACY Compression encoding: [0x81,0xef,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 -; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; CHECK-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: subflag32ri: ; NF: # %bb.0: # %entry ; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NF-NEXT: subl $123456, %edi, %ecx # encoding: 
[0x62,0xf4,0x74,0x18,0x81,0xef,0x40,0xe2,0x01,0x00] +; NF-NEXT: subl $123456, %edi # EVEX TO LEGACY Compression encoding: [0x81,0xef,0x40,0xe2,0x01,0x00] ; NF-NEXT: # imm = 0x1E240 -; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1] +; NF-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7] ; NF-NEXT: retq # encoding: [0xc3] entry: %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 123456) @@ -722,17 +722,17 @@ define i64 @subflag64ri(i64 noundef %a) { ; CHECK-LABEL: subflag64ri: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; CHECK-NEXT: subq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xef,0x40,0xe2,0x01,0x00] +; CHECK-NEXT: subq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xef,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 -; CHECK-NEXT: cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1] +; CHECK-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: subflag64ri: ; NF: # %bb.0: # %entry ; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NF-NEXT: subq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xef,0x40,0xe2,0x01,0x00] +; NF-NEXT: subq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xef,0x40,0xe2,0x01,0x00] ; NF-NEXT: # imm = 0x1E240 -; NF-NEXT: cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1] +; NF-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7] ; NF-NEXT: retq # encoding: [0xc3] entry: %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 123456) diff --git a/llvm/test/CodeGen/X86/apx/xor.ll b/llvm/test/CodeGen/X86/apx/xor.ll index 3426f9cc92ce..d908849e2848 100644 --- a/llvm/test/CodeGen/X86/apx/xor.ll +++ b/llvm/test/CodeGen/X86/apx/xor.ll @@ -428,8 +428,8 @@ entry: define i1 @xorflag8rr(i8 %a, i8 %b) { ; CHECK-LABEL: 
xorflag8rr: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %edi, %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x31,0xfe] -; CHECK-NEXT: xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff] +; CHECK-NEXT: xorl %edi, %esi # EVEX TO LEGACY Compression encoding: [0x31,0xfe] +; CHECK-NEXT: xorb $-1, %sil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf6,0xff] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; CHECK-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte @@ -437,8 +437,8 @@ define i1 @xorflag8rr(i8 %a, i8 %b) { ; ; NF-LABEL: xorflag8rr: ; NF: # %bb.0: -; NF-NEXT: {nf} xorl %edi, %esi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xfe] -; NF-NEXT: xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff] +; NF-NEXT: xorl %edi, %esi # EVEX TO LEGACY Compression encoding: [0x31,0xfe] +; NF-NEXT: xorb $-1, %sil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf6,0xff] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; NF-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte @@ -453,8 +453,8 @@ define i1 @xorflag8rr(i8 %a, i8 %b) { define i1 @xorflag16rr(i16 %a, i16 %b) { ; CHECK-LABEL: xorflag16rr: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %edi, %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x31,0xfe] -; CHECK-NEXT: xorw $-1, %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xf0,0xff] +; CHECK-NEXT: xorl %edi, %esi # EVEX TO LEGACY Compression encoding: [0x31,0xfe] +; CHECK-NEXT: xorw $-1, %si, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xf6,0xff] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; CHECK-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte @@ -462,8 +462,8 @@ define i1 @xorflag16rr(i16 %a, i16 %b) { ; ; NF-LABEL: xorflag16rr: ; NF: # %bb.0: -; NF-NEXT: {nf} xorl %edi, %esi, 
%eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xfe] -; NF-NEXT: xorw $-1, %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xf0,0xff] +; NF-NEXT: xorl %edi, %esi # EVEX TO LEGACY Compression encoding: [0x31,0xfe] +; NF-NEXT: xorw $-1, %si, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xf6,0xff] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; NF-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte @@ -478,17 +478,17 @@ define i1 @xorflag16rr(i16 %a, i16 %b) { define i1 @xorflag32rr(i32 %a, i32 %b) { ; CHECK-LABEL: xorflag32rr: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x31,0xf7] +; CHECK-NEXT: xorl %edi, %esi # EVEX TO LEGACY Compression encoding: [0x31,0xfe] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: xorflag32rr: ; NF: # %bb.0: -; NF-NEXT: xorl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x31,0xf7] +; NF-NEXT: xorl %edi, %esi # EVEX TO LEGACY Compression encoding: [0x31,0xfe] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A] ; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %v0 = xor i32 %a, %b ; 0xff << 50 @@ -500,17 +500,17 @@ define i1 @xorflag32rr(i32 %a, i32 %b) { define i1 @xorflag64rr(i64 %a, i64 %b) { ; CHECK-LABEL: xorflag64rr: ; CHECK: # %bb.0: -; CHECK-NEXT: xorq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x31,0xf7] +; CHECK-NEXT: xorq %rdi, %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x31,0xfe] ; CHECK-NEXT: sete %al # encoding: 
[0x0f,0x94,0xc0] -; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: xorflag64rr: ; NF: # %bb.0: -; NF-NEXT: xorq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x31,0xf7] +; NF-NEXT: xorq %rdi, %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x31,0xfe] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A] ; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %v0 = xor i64 %a, %b ; 0xff << 50 @@ -574,17 +574,17 @@ define i1 @xorflag16rm(ptr %ptr, i16 %b) { define i1 @xorflag32rm(ptr %ptr, i32 %b) { ; CHECK-LABEL: xorflag32rm: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl (%rdi), %esi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x33,0x37] +; CHECK-NEXT: xorl (%rdi), %esi # EVEX TO LEGACY Compression encoding: [0x33,0x37] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: xorflag32rm: ; NF: # %bb.0: -; NF-NEXT: xorl (%rdi), %esi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x33,0x37] +; NF-NEXT: xorl (%rdi), %esi # EVEX TO LEGACY Compression encoding: [0x33,0x37] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A] ; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %a = load i32, ptr %ptr @@ -597,17 +597,17 @@ define i1 @xorflag32rm(ptr 
%ptr, i32 %b) { define i1 @xorflag64rm(ptr %ptr, i64 %b) { ; CHECK-LABEL: xorflag64rm: ; CHECK: # %bb.0: -; CHECK-NEXT: xorq (%rdi), %rsi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x33,0x37] +; CHECK-NEXT: xorq (%rdi), %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x33,0x37] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: xorflag64rm: ; NF: # %bb.0: -; NF-NEXT: xorq (%rdi), %rsi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x33,0x37] +; NF-NEXT: xorq (%rdi), %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x33,0x37] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A] ; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %a = load i64, ptr %ptr @@ -668,19 +668,19 @@ define i1 @xorflag16ri(i16 %a) { define i1 @xorflag32ri(i32 %a) { ; CHECK-LABEL: xorflag32ri: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xf7,0x40,0xe2,0x01,0x00] +; CHECK-NEXT: xorl $123456, %edi # EVEX TO LEGACY Compression encoding: [0x81,0xf7,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movl %edi, d64(%rip) # encoding: [0x89,0x3d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: xorflag32ri: ; NF: # %bb.0: -; NF-NEXT: xorl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xf7,0x40,0xe2,0x01,0x00] +; NF-NEXT: xorl $123456, %edi # EVEX TO LEGACY Compression 
encoding: [0x81,0xf7,0x40,0xe2,0x01,0x00] ; NF-NEXT: # imm = 0x1E240 ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: movl %edi, d64(%rip) # encoding: [0x89,0x3d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %v0 = xor i32 %a, 123456 ; 0xff << 50 @@ -692,19 +692,19 @@ define i1 @xorflag32ri(i32 %a) { define i1 @xorflag64ri(i64 %a) { ; CHECK-LABEL: xorflag64ri: ; CHECK: # %bb.0: -; CHECK-NEXT: xorq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xf7,0x40,0xe2,0x01,0x00] +; CHECK-NEXT: xorq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xf7,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: xorflag64ri: ; NF: # %bb.0: -; NF-NEXT: xorq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xf7,0x40,0xe2,0x01,0x00] +; NF-NEXT: xorq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xf7,0x40,0xe2,0x01,0x00] ; NF-NEXT: # imm = 0x1E240 ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %v0 = xor i64 %a, 123456 ; 0xff << 50 @@ -739,17 +739,17 @@ define i1 @xorflag16ri8(i16 %a) { define i1 @xorflag32ri8(i32 %a) { ; CHECK-LABEL: xorflag32ri8: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xf7,0x7b] +; CHECK-NEXT: xorl $123, %edi # EVEX TO LEGACY Compression encoding: 
[0x83,0xf7,0x7b] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movl %edi, d64(%rip) # encoding: [0x89,0x3d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: xorflag32ri8: ; NF: # %bb.0: -; NF-NEXT: xorl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xf7,0x7b] +; NF-NEXT: xorl $123, %edi # EVEX TO LEGACY Compression encoding: [0x83,0xf7,0x7b] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A] +; NF-NEXT: movl %edi, d64(%rip) # encoding: [0x89,0x3d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %v0 = xor i32 %a, 123 ; 0xff << 50 @@ -761,17 +761,17 @@ define i1 @xorflag32ri8(i32 %a) { define i1 @xorflag64ri8(i64 %a) { ; CHECK-LABEL: xorflag64ri8: ; CHECK: # %bb.0: -; CHECK-NEXT: xorq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xf7,0x7b] +; CHECK-NEXT: xorq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xf7,0x7b] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; CHECK-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: xorflag64ri8: ; NF: # %bb.0: -; NF-NEXT: xorq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xf7,0x7b] +; NF-NEXT: xorq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xf7,0x7b] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] -; NF-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NF-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; NF-NEXT: retq # encoding: [0xc3] %v0 = xor 
i64 %a, 123 ; 0xff << 50 diff --git a/llvm/test/CodeGen/X86/avx10_2_512satcvt-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512satcvt-intrinsics.ll new file mode 100644 index 000000000000..19860530c030 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx10_2_512satcvt-intrinsics.ll @@ -0,0 +1,1003 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64 --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc < %s -verify-machineinstrs -mtriple=i686 --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X86 + +define dso_local <8 x i64> @test_mm512_ipcvtnebf16_epi8(<32 x bfloat> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvtnebf16_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtnebf162ibs %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7f,0x48,0x69,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.vcvtnebf162ibs512(<32 x bfloat> %__A) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvtnebf16_epi8(<8 x i64> noundef %__S, i32 noundef %__A, <32 x bfloat> noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvtnebf16_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtnebf162ibs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x49,0x69,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_ipcvtnebf16_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtnebf162ibs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x49,0x69,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <32 x i16> + %1 = tail call <32 x i16> @llvm.x86.avx10.vcvtnebf162ibs512(<32 x bfloat> %__B) + %2 = bitcast i32 %__A to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> 
%1, <32 x i16> %0 + %4 = bitcast <32 x i16> %3 to <8 x i64> + ret <8 x i64> %4 +} + +declare <32 x i16> @llvm.x86.avx10.vcvtnebf162ibs512(<32 x bfloat>) + +define dso_local <8 x i64> @test_mm512_maskz_ipcvtnebf16_epi8(i32 noundef %__A, <32 x bfloat> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvtnebf16_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtnebf162ibs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x69,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvtnebf16_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtnebf162ibs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x69,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.vcvtnebf162ibs512(<32 x bfloat> %__B) + %1 = bitcast i32 %__A to <32 x i1> + %2 = select <32 x i1> %1, <32 x i16> %0, <32 x i16> zeroinitializer + %3 = bitcast <32 x i16> %2 to <8 x i64> + ret <8 x i64> %3 +} + +define dso_local <8 x i64> @test_mm512_ipcvtnebf16_epu8(<32 x bfloat> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvtnebf16_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtnebf162iubs %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7f,0x48,0x6b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.vcvtnebf162iubs512(<32 x bfloat> %__A) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvtnebf16_epu8(<8 x i64> noundef %__S, i32 noundef %__A, <32 x bfloat> noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvtnebf16_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtnebf162iubs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x49,0x6b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: 
test_mm512_mask_ipcvtnebf16_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtnebf162iubs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x49,0x6b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <32 x i16> + %1 = tail call <32 x i16> @llvm.x86.avx10.vcvtnebf162iubs512(<32 x bfloat> %__B) + %2 = bitcast i32 %__A to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %0 + %4 = bitcast <32 x i16> %3 to <8 x i64> + ret <8 x i64> %4 +} + +declare <32 x i16> @llvm.x86.avx10.vcvtnebf162iubs512(<32 x bfloat>) + +define dso_local <8 x i64> @test_mm512_maskz_ipcvtnebf16_epu8(i32 noundef %__A, <32 x bfloat> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvtnebf16_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtnebf162iubs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x6b,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvtnebf16_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtnebf162iubs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x6b,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.vcvtnebf162iubs512(<32 x bfloat> %__B) + %1 = bitcast i32 %__A to <32 x i1> + %2 = select <32 x i1> %1, <32 x i16> %0, <32 x i16> zeroinitializer + %3 = bitcast <32 x i16> %2 to <8 x i64> + ret <8 x i64> %3 +} + +define dso_local <8 x i64> @test_mm512_ipcvtph_epi8(<32 x half> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvtph_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtph2ibs %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7c,0x48,0x69,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvtph2ibs512(<32 x half> %__A, <32 x i16> zeroinitializer, i32 -1, 
i32 4) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvtph_epi8(<8 x i64> noundef %__S, i32 noundef %__A, <32 x half> noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvtph_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2ibs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x49,0x69,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_ipcvtph_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2ibs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x49,0x69,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <32 x i16> + %1 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvtph2ibs512(<32 x half> %__B, <32 x i16> %0, i32 %__A, i32 4) + %2 = bitcast <32 x i16> %1 to <8 x i64> + ret <8 x i64> %2 +} + +declare <32 x i16> @llvm.x86.avx10.mask.vcvtph2ibs512(<32 x half>, <32 x i16>, i32, i32) + +define dso_local <8 x i64> @test_mm512_maskz_ipcvtph_epi8(i32 noundef %__A, <32 x half> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvtph_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2ibs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xc9,0x69,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvtph_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2ibs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xc9,0x69,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvtph2ibs512(<32 x half> %__B, <32 x i16> zeroinitializer, i32 %__A, i32 4) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_ipcvt_roundph_epi8(<32 x half> 
noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvt_roundph_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtph2ibs {rz-sae}, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7c,0x78,0x69,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvtph2ibs512(<32 x half> %__A, <32 x i16> zeroinitializer, i32 -1, i32 11) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvt_roundph_epi8(<8 x i64> noundef %__S, i32 noundef %__A, <32 x half> noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvt_roundph_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2ibs {rz-sae}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x79,0x69,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_ipcvt_roundph_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2ibs {rz-sae}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x79,0x69,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <32 x i16> + %1 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvtph2ibs512(<32 x half> %__B, <32 x i16> %0, i32 %__A, i32 11) + %2 = bitcast <32 x i16> %1 to <8 x i64> + ret <8 x i64> %2 +} + +define dso_local <8 x i64> @test_mm512_maskz_ipcvt_roundph_epi8(i32 noundef %__A, <32 x half> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvt_roundph_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2ibs {rz-sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xf9,0x69,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvt_roundph_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2ibs {rz-sae}, %zmm0, %zmm0 {%k1} {z} # encoding: 
[0x62,0xf5,0x7c,0xf9,0x69,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvtph2ibs512(<32 x half> %__B, <32 x i16> zeroinitializer, i32 %__A, i32 11) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_ipcvtph_epu8(<32 x half> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvtph_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtph2iubs %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7c,0x48,0x6b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvtph2iubs512(<32 x half> %__A, <32 x i16> zeroinitializer, i32 -1, i32 4) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvtph_epu8(<8 x i64> noundef %__S, i32 noundef %__A, <32 x half> noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvtph_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2iubs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x49,0x6b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_ipcvtph_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2iubs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x49,0x6b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <32 x i16> + %1 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvtph2iubs512( <32 x half> %__B, <32 x i16> %0, i32 %__A, i32 4) + %2 = bitcast <32 x i16> %1 to <8 x i64> + ret <8 x i64> %2 +} + +define dso_local <8 x i64> @test_mm512_maskz_ipcvtph_epu8(i32 noundef %__A, <32 x half> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvtph_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2iubs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xc9,0x6b,0xc0] +; 
X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvtph_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2iubs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xc9,0x6b,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvtph2iubs512(<32 x half> %__B, <32 x i16> zeroinitializer, i32 %__A, i32 4) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_ipcvt_roundph_epu8(<32 x half> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvt_roundph_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtph2iubs {rz-sae}, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7c,0x78,0x6b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvtph2iubs512(<32 x half> %__A, <32 x i16> zeroinitializer, i32 -1, i32 11) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvt_roundph_epu8(<8 x i64> noundef %__S, i32 noundef %__A, <32 x half> noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvt_roundph_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2iubs {rz-sae}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x79,0x6b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_ipcvt_roundph_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2iubs {rz-sae}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x79,0x6b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <32 x i16> + %1 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvtph2iubs512(<32 x half> %__B, <32 x i16> %0, i32 %__A, i32 11) + %2 = bitcast <32 x i16> %1 to <8 x i64> + ret <8 x i64> %2 +} + +declare <32 x i16> 
@llvm.x86.avx10.mask.vcvtph2iubs512(<32 x half>, <32 x i16>, i32, i32) + +define dso_local <8 x i64> @test_mm512_maskz_ipcvt_roundph_epu8(i32 noundef %__A, <32 x half> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvt_roundph_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2iubs {rz-sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xf9,0x6b,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvt_roundph_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2iubs {rz-sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xf9,0x6b,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvtph2iubs512(<32 x half> %__B, <32 x i16> zeroinitializer, i32 %__A, i32 11) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_ipcvtps_epi8(<16 x float> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvtps_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtps2ibs %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x69,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvtps2ibs512(<16 x float> %__A, <16 x i32> zeroinitializer, i16 -1, i32 4) + %1 = bitcast <16 x i32> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvtps_epi8(<8 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x float> noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvtps_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2ibs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x69,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_ipcvtps_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: 
[0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2ibs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x69,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <16 x i32> + %1 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvtps2ibs512(<16 x float> %__B, <16 x i32> %0, i16 %__A, i32 4) + %2 = bitcast <16 x i32> %1 to <8 x i64> + ret <8 x i64> %2 +} + +declare <16 x i32> @llvm.x86.avx10.mask.vcvtps2ibs512(<16 x float>, <16 x i32>, i16, i32) + +define dso_local <8 x i64> @test_mm512_maskz_ipcvtps_epi8(i16 noundef zeroext %__A, <16 x float> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvtps_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2ibs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x69,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvtps_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2ibs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x69,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvtps2ibs512(<16 x float> %__B, <16 x i32> zeroinitializer, i16 %__A, i32 4) + %1 = bitcast <16 x i32> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_ipcvt_roundps_epi8(<16 x float> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvt_roundps_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtps2ibs {rz-sae}, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x78,0x69,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvtps2ibs512(<16 x float> %__A, <16 x i32> zeroinitializer, i16 -1, i32 11) + %1 = bitcast <16 x i32> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvt_roundps_epi8(<8 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x float> noundef %__B) { +; 
X64-LABEL: test_mm512_mask_ipcvt_roundps_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2ibs {rz-sae}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x79,0x69,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_ipcvt_roundps_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2ibs {rz-sae}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x79,0x69,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <16 x i32> + %1 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvtps2ibs512(<16 x float> %__B, <16 x i32> %0, i16 %__A, i32 11) + %2 = bitcast <16 x i32> %1 to <8 x i64> + ret <8 x i64> %2 +} + +define dso_local <8 x i64> @test_mm512_maskz_ipcvt_roundps_epi8(i16 noundef zeroext %__A, <16 x float> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvt_roundps_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2ibs {rz-sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xf9,0x69,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvt_roundps_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2ibs {rz-sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xf9,0x69,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvtps2ibs512(<16 x float> %__B, <16 x i32> zeroinitializer, i16 %__A, i32 11) + %1 = bitcast <16 x i32> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_ipcvtps_epu8(<16 x float> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvtps_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtps2iubs %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x6b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 
= tail call <16 x i32> @llvm.x86.avx10.mask.vcvtps2iubs512(<16 x float> %__A, <16 x i32> zeroinitializer, i16 -1, i32 4) + %1 = bitcast <16 x i32> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvtps_epu8(<8 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x float> noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvtps_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2iubs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x6b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_ipcvtps_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2iubs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x6b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <16 x i32> + %1 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvtps2iubs512(<16 x float> %__B, <16 x i32> %0, i16 %__A, i32 4) + %2 = bitcast <16 x i32> %1 to <8 x i64> + ret <8 x i64> %2 +} + +define dso_local <8 x i64> @test_mm512_maskz_ipcvtps_epu8(i16 noundef zeroext %__A, <16 x float> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvtps_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2iubs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x6b,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvtps_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2iubs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x6b,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvtps2iubs512(<16 x float> %__B, <16 x i32> zeroinitializer, i16 %__A, i32 4) + %1 = bitcast <16 x i32> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> 
@test_mm512_ipcvt_roundps_epu8(<16 x float> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvt_roundps_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtps2iubs {rz-sae}, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x78,0x6b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvtps2iubs512(<16 x float> %__A, <16 x i32> zeroinitializer, i16 -1, i32 11) + %1 = bitcast <16 x i32> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvt_roundps_epu8(<8 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x float> noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvt_roundps_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2iubs {rz-sae}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x79,0x6b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_ipcvt_roundps_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2iubs {rz-sae}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x79,0x6b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <16 x i32> + %1 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvtps2iubs512(<16 x float> %__B, <16 x i32> %0, i16 %__A, i32 11) + %2 = bitcast <16 x i32> %1 to <8 x i64> + ret <8 x i64> %2 +} + +declare <16 x i32> @llvm.x86.avx10.mask.vcvtps2iubs512(<16 x float>, <16 x i32>, i16, i32) + +define dso_local <8 x i64> @test_mm512_maskz_ipcvt_roundps_epu8(i16 noundef zeroext %__A, <16 x float> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvt_roundps_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2iubs {rz-sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xf9,0x6b,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvt_roundps_epu8: +; X86: # %bb.0: # %entry 
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2iubs {rz-sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xf9,0x6b,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvtps2iubs512(<16 x float> %__B, <16 x i32> zeroinitializer, i16 %__A, i32 11) + %1 = bitcast <16 x i32> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_ipcvttnebf16_epi8(<32 x bfloat> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvttnebf16_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttnebf162ibs %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7f,0x48,0x68,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.vcvttnebf162ibs512(<32 x bfloat> %__A) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvttnebf16_epi8(<8 x i64> noundef %__S, i32 noundef %__A, <32 x bfloat> noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvttnebf16_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttnebf162ibs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x49,0x68,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_ipcvttnebf16_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttnebf162ibs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x49,0x68,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <32 x i16> + %1 = tail call <32 x i16> @llvm.x86.avx10.vcvttnebf162ibs512(<32 x bfloat> %__B) + %2 = bitcast i32 %__A to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %0 + %4 = bitcast <32 x i16> %3 to <8 x i64> + ret <8 x i64> %4 +} + +declare <32 x i16> @llvm.x86.avx10.vcvttnebf162ibs512(<32 x bfloat>) + +define dso_local <8 x i64> 
@test_mm512_maskz_ipcvttnebf16_epi8(i32 noundef %__A, <32 x bfloat> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvttnebf16_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttnebf162ibs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x68,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvttnebf16_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttnebf162ibs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x68,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.vcvttnebf162ibs512(<32 x bfloat> %__B) + %1 = bitcast i32 %__A to <32 x i1> + %2 = select <32 x i1> %1, <32 x i16> %0, <32 x i16> zeroinitializer + %3 = bitcast <32 x i16> %2 to <8 x i64> + ret <8 x i64> %3 +} + +define dso_local <8 x i64> @test_mm512_ipcvttnebf16_epu8(<32 x bfloat> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvttnebf16_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttnebf162iubs %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7f,0x48,0x6a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.vcvttnebf162iubs512(<32 x bfloat> %__A) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvttnebf16_epu8(<8 x i64> noundef %__S, i32 noundef %__A, <32 x bfloat> noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvttnebf16_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttnebf162iubs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x49,0x6a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_ipcvttnebf16_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttnebf162iubs %zmm1, %zmm0 {%k1} # 
encoding: [0x62,0xf5,0x7f,0x49,0x6a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <32 x i16> + %1 = tail call <32 x i16> @llvm.x86.avx10.vcvttnebf162iubs512(<32 x bfloat> %__B) + %2 = bitcast i32 %__A to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %0 + %4 = bitcast <32 x i16> %3 to <8 x i64> + ret <8 x i64> %4 +} + +declare <32 x i16> @llvm.x86.avx10.vcvttnebf162iubs512(<32 x bfloat>) + +define dso_local <8 x i64> @test_mm512_maskz_ipcvttnebf16_epu8(i32 noundef %__A, <32 x bfloat> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvttnebf16_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttnebf162iubs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x6a,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvttnebf16_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttnebf162iubs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xc9,0x6a,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.vcvttnebf162iubs512(<32 x bfloat> %__B) + %1 = bitcast i32 %__A to <32 x i1> + %2 = select <32 x i1> %1, <32 x i16> %0, <32 x i16> zeroinitializer + %3 = bitcast <32 x i16> %2 to <8 x i64> + ret <8 x i64> %3 +} + +define dso_local <8 x i64> @test_mm512_ipcvttph_epi8(<32 x half> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvttph_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttph2ibs %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7c,0x48,0x68,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvttph2ibs512(<32 x half> %__A, <32 x i16> zeroinitializer, i32 -1, i32 4) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvttph_epi8(<8 x i64> noundef %__S, i32 noundef %__A, <32 x half> 
noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvttph_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2ibs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x49,0x68,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_ipcvttph_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2ibs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x49,0x68,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <32 x i16> + %1 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvttph2ibs512(<32 x half> %__B, <32 x i16> %0, i32 %__A, i32 4) + %2 = bitcast <32 x i16> %1 to <8 x i64> + ret <8 x i64> %2 +} + +define dso_local <8 x i64> @test_mm512_maskz_ipcvttph_epi8(i32 noundef %__A, <32 x half> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvttph_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2ibs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xc9,0x68,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvttph_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2ibs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xc9,0x68,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvttph2ibs512(<32 x half> %__B, <32 x i16> zeroinitializer, i32 %__A, i32 4) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_ipcvtt_roundph_epi8(<32 x half> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvtt_roundph_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttph2ibs {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7c,0x18,0x68,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> 
@llvm.x86.avx10.mask.vcvttph2ibs512(<32 x half> %__A, <32 x i16> zeroinitializer, i32 -1, i32 8) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvtt_roundph_epi8(<8 x i64> noundef %__S, i32 noundef %__A, <32 x half> noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvtt_roundph_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2ibs {sae}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x19,0x68,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_ipcvtt_roundph_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2ibs {sae}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x19,0x68,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <32 x i16> + %1 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvttph2ibs512(<32 x half> %__B, <32 x i16> %0, i32 %__A, i32 8) + %2 = bitcast <32 x i16> %1 to <8 x i64> + ret <8 x i64> %2 +} + +declare <32 x i16> @llvm.x86.avx10.mask.vcvttph2ibs512(<32 x half>, <32 x i16>, i32, i32) + +define dso_local <8 x i64> @test_mm512_maskz_ipcvtt_roundph_epi8(i32 noundef %__A, <32 x half> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvtt_roundph_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2ibs {sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0x99,0x68,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvtt_roundph_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2ibs {sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0x99,0x68,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvttph2ibs512(<32 x half> %__B, <32 x i16> 
zeroinitializer, i32 %__A, i32 8) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_ipcvttph_epu8(<32 x half> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvttph_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttph2iubs %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7c,0x48,0x6a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvttph2iubs512(<32 x half> %__A, <32 x i16> zeroinitializer, i32 -1, i32 4) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvttph_epu8(<8 x i64> noundef %__S, i32 noundef %__A, <32 x half> noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvttph_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2iubs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x49,0x6a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_ipcvttph_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2iubs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x49,0x6a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <32 x i16> + %1 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvttph2iubs512(<32 x half> %__B, <32 x i16> %0, i32 %__A, i32 4) + %2 = bitcast <32 x i16> %1 to <8 x i64> + ret <8 x i64> %2 +} + +define dso_local <8 x i64> @test_mm512_maskz_ipcvttph_epu8(i32 noundef %__A, <32 x half> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvttph_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2iubs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xc9,0x6a,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvttph_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: 
[0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2iubs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xc9,0x6a,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvttph2iubs512(<32 x half> %__B, <32 x i16> zeroinitializer, i32 %__A, i32 4) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_ipcvtt_roundph_epu8(<32 x half> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvtt_roundph_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttph2iubs {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7c,0x18,0x6a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvttph2iubs512(<32 x half> %__A, <32 x i16> zeroinitializer, i32 -1, i32 8) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvtt_roundph_epu8(<8 x i64> noundef %__S, i32 noundef %__A, <32 x half> noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvtt_roundph_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2iubs {sae}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x19,0x6a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_ipcvtt_roundph_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2iubs {sae}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x19,0x6a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <32 x i16> + %1 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvttph2iubs512(<32 x half> %__B, <32 x i16> %0, i32 %__A, i32 8) + %2 = bitcast <32 x i16> %1 to <8 x i64> + ret <8 x i64> %2 +} + +declare <32 x i16> @llvm.x86.avx10.mask.vcvttph2iubs512(<32 x half>, <32 x i16>, i32, i32) + +define dso_local <8 x i64> @test_mm512_maskz_ipcvtt_roundph_epu8(i32 noundef %__A, <32 x 
half> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvtt_roundph_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2iubs {sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0x99,0x6a,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvtt_roundph_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2iubs {sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0x99,0x6a,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <32 x i16> @llvm.x86.avx10.mask.vcvttph2iubs512(<32 x half> %__B, <32 x i16> zeroinitializer, i32 %__A, i32 8) + %1 = bitcast <32 x i16> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_ipcvttps_epi8(<16 x float> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvttps_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttps2ibs %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x68,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvttps2ibs512(<16 x float> %__A, <16 x i32> zeroinitializer, i16 -1, i32 4) + %1 = bitcast <16 x i32> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvttps_epi8(<8 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x float> noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvttps_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2ibs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x68,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_ipcvttps_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2ibs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x68,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast 
<8 x i64> %__S to <16 x i32> + %1 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvttps2ibs512(<16 x float> %__B, <16 x i32> %0, i16 %__A, i32 4) + %2 = bitcast <16 x i32> %1 to <8 x i64> + ret <8 x i64> %2 +} + +define dso_local <8 x i64> @test_mm512_maskz_ipcvttps_epi8(i16 noundef zeroext %__A, <16 x float> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvttps_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2ibs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x68,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvttps_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2ibs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x68,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvttps2ibs512(<16 x float> %__B, <16 x i32> zeroinitializer, i16 %__A, i32 4) + %1 = bitcast <16 x i32> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_ipcvtt_roundps_epi8(<16 x float> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvtt_roundps_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttps2ibs {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x18,0x68,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvttps2ibs512(<16 x float> %__A, <16 x i32> zeroinitializer, i16 -1, i32 8) + %1 = bitcast <16 x i32> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvtt_roundps_epi8(<8 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x float> noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvtt_roundps_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2ibs {sae}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x19,0x68,0xc1] +; X64-NEXT: retq # encoding: [0xc3] 
+; +; X86-LABEL: test_mm512_mask_ipcvtt_roundps_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2ibs {sae}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x19,0x68,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <16 x i32> + %1 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvttps2ibs512(<16 x float> %__B, <16 x i32> %0, i16 %__A, i32 8) + %2 = bitcast <16 x i32> %1 to <8 x i64> + ret <8 x i64> %2 +} + +declare <16 x i32> @llvm.x86.avx10.mask.vcvttps2ibs512(<16 x float>, <16 x i32>, i16, i32) + +define dso_local <8 x i64> @test_mm512_maskz_ipcvtt_roundps_epi8(i16 noundef zeroext %__A, <16 x float> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvtt_roundps_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2ibs {sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x99,0x68,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvtt_roundps_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2ibs {sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x99,0x68,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvttps2ibs512(<16 x float> %__B, <16 x i32> zeroinitializer, i16 %__A, i32 8) + %1 = bitcast <16 x i32> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_ipcvttps_epu8(<16 x float> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvttps_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttps2iubs %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x6a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvttps2iubs512(<16 x float> %__A, <16 x i32> zeroinitializer, i16 -1, i32 4) + %1 = bitcast <16 x i32> %0 to <8 x i64> + ret <8 x 
i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvttps_epu8(<8 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x float> noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvttps_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2iubs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x6a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_ipcvttps_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2iubs %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x6a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <16 x i32> + %1 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvttps2iubs512(<16 x float> %__B, <16 x i32> %0, i16 %__A, i32 4) + %2 = bitcast <16 x i32> %1 to <8 x i64> + ret <8 x i64> %2 +} + +define dso_local <8 x i64> @test_mm512_maskz_ipcvttps_epu8(i16 noundef zeroext %__A, <16 x float> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvttps_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2iubs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x6a,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvttps_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2iubs %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xc9,0x6a,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvttps2iubs512(<16 x float> %__B, <16 x i32> zeroinitializer, i16 %__A, i32 4) + %1 = bitcast <16 x i32> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_ipcvtt_roundps_epu8(<16 x float> noundef %__A) { +; CHECK-LABEL: test_mm512_ipcvtt_roundps_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttps2iubs 
{sae}, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x18,0x6a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvttps2iubs512(<16 x float> %__A, <16 x i32> zeroinitializer, i16 -1, i32 8) + %1 = bitcast <16 x i32> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define dso_local <8 x i64> @test_mm512_mask_ipcvtt_roundps_epu8(<8 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x float> noundef %__B) { +; X64-LABEL: test_mm512_mask_ipcvtt_roundps_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2iubs {sae}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x19,0x6a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_mask_ipcvtt_roundps_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2iubs {sae}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x19,0x6a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <8 x i64> %__S to <16 x i32> + %1 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvttps2iubs512(<16 x float> %__B, <16 x i32> %0, i16 %__A, i32 8) + %2 = bitcast <16 x i32> %1 to <8 x i64> + ret <8 x i64> %2 +} + +declare <16 x i32> @llvm.x86.avx10.mask.vcvttps2iubs512(<16 x float>, <16 x i32>, i16, i32) + +define dso_local <8 x i64> @test_mm512_maskz_ipcvtt_roundps_epu8(i16 noundef zeroext %__A, <16 x float> noundef %__B) { +; X64-LABEL: test_mm512_maskz_ipcvtt_roundps_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2iubs {sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x99,0x6a,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm512_maskz_ipcvtt_roundps_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2iubs {sae}, %zmm0, %zmm0 {%k1} {z} # encoding: 
[0x62,0xf5,0x7d,0x99,0x6a,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i32> @llvm.x86.avx10.mask.vcvttps2iubs512(<16 x float> %__B, <16 x i32> zeroinitializer, i16 %__A, i32 8) + %1 = bitcast <16 x i32> %0 to <8 x i64> + ret <8 x i64> %1 +} diff --git a/llvm/test/CodeGen/X86/avx10_2satcvt-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2satcvt-intrinsics.ll new file mode 100644 index 000000000000..e16aa9d2de31 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx10_2satcvt-intrinsics.ll @@ -0,0 +1,1618 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64 --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc < %s -verify-machineinstrs -mtriple=i686 --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X86 + +define dso_local <2 x i64> @test_mm_ipcvtnebf16_epi8(<8 x bfloat> noundef %__A) { +; CHECK-LABEL: test_mm_ipcvtnebf16_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtnebf162ibs %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7f,0x08,0x69,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <8 x i16> @llvm.x86.avx10.vcvtnebf162ibs128(<8 x bfloat> %__A) + %1 = bitcast <8 x i16> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define dso_local <2 x i64> @test_mm_mask_ipcvtnebf16_epi8(<2 x i64> noundef %__S, i8 noundef zeroext %__A, <8 x bfloat> noundef %__B) { +; X64-LABEL: test_mm_mask_ipcvtnebf16_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtnebf162ibs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x09,0x69,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_ipcvtnebf16_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtnebf162ibs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x09,0x69,0xc1] +; X86-NEXT: retl # encoding: 
[0xc3] +entry: + %0 = bitcast <2 x i64> %__S to <8 x i16> + %1 = tail call <8 x i16> @llvm.x86.avx10.vcvtnebf162ibs128(<8 x bfloat> %__B) + %2 = bitcast i8 %__A to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %0 + %4 = bitcast <8 x i16> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <8 x i16> @llvm.x86.avx10.vcvtnebf162ibs128(<8 x bfloat>) + +define dso_local <2 x i64> @test_mm_maskz_ipcvtnebf16_epi8(i8 noundef zeroext %__A, <8 x bfloat> noundef %__B) { +; X64-LABEL: test_mm_maskz_ipcvtnebf16_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtnebf162ibs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x69,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_ipcvtnebf16_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtnebf162ibs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x69,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x i16> @llvm.x86.avx10.vcvtnebf162ibs128(<8 x bfloat> %__B) + %1 = bitcast i8 %__A to <8 x i1> + %2 = select <8 x i1> %1, <8 x i16> %0, <8 x i16> zeroinitializer + %3 = bitcast <8 x i16> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define dso_local <4 x i64> @test_mm256_ipcvtnebf16_epi8(<16 x bfloat> noundef %__A) local_unnamed_addr #2 { +; CHECK-LABEL: test_mm256_ipcvtnebf16_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtnebf162ibs %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7f,0x28,0x69,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.vcvtnebf162ibs256(<16 x bfloat> %__A) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvtnebf16_epi8(<4 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x bfloat> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_mask_ipcvtnebf16_epi8: +; X64: # %bb.0: # %entry 
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtnebf162ibs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x29,0x69,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvtnebf16_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtnebf162ibs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x29,0x69,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <16 x i16> + %1 = tail call <16 x i16> @llvm.x86.avx10.vcvtnebf162ibs256(<16 x bfloat> %__B) + %2 = bitcast i16 %__A to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %0 + %4 = bitcast <16 x i16> %3 to <4 x i64> + ret <4 x i64> %4 +} + +declare <16 x i16> @llvm.x86.avx10.vcvtnebf162ibs256(<16 x bfloat>) + +define dso_local <4 x i64> @test_mm256_maskz_ipcvtnebf16_epi8(i16 noundef zeroext %__A, <16 x bfloat> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_maskz_ipcvtnebf16_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtnebf162ibs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x69,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvtnebf16_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtnebf162ibs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x69,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.vcvtnebf162ibs256(<16 x bfloat> %__B) + %1 = bitcast i16 %__A to <16 x i1> + %2 = select <16 x i1> %1, <16 x i16> %0, <16 x i16> zeroinitializer + %3 = bitcast <16 x i16> %2 to <4 x i64> + ret <4 x i64> %3 +} + +define dso_local <2 x i64> @test_mm_ipcvtnebf16_epu8(<8 x bfloat> noundef %__A) { +; CHECK-LABEL: test_mm_ipcvtnebf16_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtnebf162iubs 
%xmm0, %xmm0 # encoding: [0x62,0xf5,0x7f,0x08,0x6b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <8 x i16> @llvm.x86.avx10.vcvtnebf162iubs128(<8 x bfloat> %__A) + %1 = bitcast <8 x i16> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define dso_local <2 x i64> @test_mm_mask_ipcvtnebf16_epu8(<2 x i64> noundef %__S, i8 noundef zeroext %__A, <8 x bfloat> noundef %__B) { +; X64-LABEL: test_mm_mask_ipcvtnebf16_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtnebf162iubs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x09,0x6b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_ipcvtnebf16_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtnebf162iubs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x09,0x6b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <2 x i64> %__S to <8 x i16> + %1 = tail call <8 x i16> @llvm.x86.avx10.vcvtnebf162iubs128(<8 x bfloat> %__B) + %2 = bitcast i8 %__A to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %0 + %4 = bitcast <8 x i16> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <8 x i16> @llvm.x86.avx10.vcvtnebf162iubs128(<8 x bfloat>) + +define dso_local <2 x i64> @test_mm_maskz_ipcvtnebf16_epu8(i8 noundef zeroext %__A, <8 x bfloat> noundef %__B) { +; X64-LABEL: test_mm_maskz_ipcvtnebf16_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtnebf162iubs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x6b,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_ipcvtnebf16_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtnebf162iubs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x6b,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail 
call <8 x i16> @llvm.x86.avx10.vcvtnebf162iubs128(<8 x bfloat> %__B) + %1 = bitcast i8 %__A to <8 x i1> + %2 = select <8 x i1> %1, <8 x i16> %0, <8 x i16> zeroinitializer + %3 = bitcast <8 x i16> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define dso_local <4 x i64> @test_mm256_ipcvtnebf16_epu8(<16 x bfloat> noundef %__A) local_unnamed_addr #2 { +; CHECK-LABEL: test_mm256_ipcvtnebf16_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtnebf162iubs %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7f,0x28,0x6b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.vcvtnebf162iubs256(<16 x bfloat> %__A) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvtnebf16_epu8(<4 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x bfloat> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_mask_ipcvtnebf16_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtnebf162iubs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x29,0x6b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvtnebf16_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtnebf162iubs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x29,0x6b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <16 x i16> + %1 = tail call <16 x i16> @llvm.x86.avx10.vcvtnebf162iubs256(<16 x bfloat> %__B) + %2 = bitcast i16 %__A to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %0 + %4 = bitcast <16 x i16> %3 to <4 x i64> + ret <4 x i64> %4 +} + +define dso_local <4 x i64> @test_mm256_maskz_ipcvtnebf16_epu8(i16 noundef zeroext %__A, <16 x bfloat> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_maskz_ipcvtnebf16_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: 
[0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtnebf162iubs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x6b,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvtnebf16_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtnebf162iubs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x6b,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.vcvtnebf162iubs256(<16 x bfloat> %__B) + %1 = bitcast i16 %__A to <16 x i1> + %2 = select <16 x i1> %1, <16 x i16> %0, <16 x i16> zeroinitializer + %3 = bitcast <16 x i16> %2 to <4 x i64> + ret <4 x i64> %3 +} + +declare <16 x i16> @llvm.x86.avx10.vcvtnebf162iubs256(<16 x bfloat>) + +define dso_local <2 x i64> @test_mm_ipcvtph_epi8(<8 x half> noundef %__A) { +; CHECK-LABEL: test_mm_ipcvtph_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtph2ibs %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x08,0x69,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <8 x i16> @llvm.x86.avx10.mask.vcvtph2ibs128(<8 x half> %__A, <8 x i16> zeroinitializer, i8 -1) + %1 = bitcast <8 x i16> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define dso_local <2 x i64> @test_mm_mask_ipcvtph_epi8(<2 x i64> noundef %__S, i8 noundef zeroext %__A, <8 x half> noundef %__B) { +; X64-LABEL: test_mm_mask_ipcvtph_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2ibs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x09,0x69,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_ipcvtph_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2ibs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x09,0x69,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <2 x i64> %__S to <8 x i16> ; passthrough lanes for merge masking + %1 = tail call <8 x i16> 
@llvm.x86.avx10.mask.vcvtph2ibs128(<8 x half> %__B, <8 x i16> %0, i8 %__A) + %2 = bitcast <8 x i16> %1 to <2 x i64> + ret <2 x i64> %2 +} + +declare <8 x i16> @llvm.x86.avx10.mask.vcvtph2ibs128(<8 x half>, <8 x i16>, i8) + +define dso_local <2 x i64> @test_mm_maskz_ipcvtph_epi8(i8 noundef zeroext %__A, <8 x half> noundef %__B) { +; X64-LABEL: test_mm_maskz_ipcvtph_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2ibs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0x89,0x69,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_ipcvtph_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2ibs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0x89,0x69,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x i16> @llvm.x86.avx10.mask.vcvtph2ibs128(<8 x half> %__B, <8 x i16> zeroinitializer, i8 %__A) + %1 = bitcast <8 x i16> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_ipcvtph_epi8(<16 x half> noundef %__A) local_unnamed_addr #2 { +; CHECK-LABEL: test_mm256_ipcvtph_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtph2ibs %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7c,0x28,0x69,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvtph2ibs256(<16 x half> %__A, <16 x i16> zeroinitializer, i16 -1, i32 4) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvtph_epi8(<4 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x half> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_mask_ipcvtph_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2ibs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x29,0x69,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; 
X86-LABEL: test_mm256_mask_ipcvtph_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2ibs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x29,0x69,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <16 x i16> + %1 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvtph2ibs256(<16 x half> %__B, <16 x i16> %0, i16 %__A, i32 4) + %2 = bitcast <16 x i16> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define dso_local <4 x i64> @test_mm256_maskz_ipcvtph_epi8(i16 noundef zeroext %__A, <16 x half> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_maskz_ipcvtph_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2ibs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xa9,0x69,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvtph_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2ibs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xa9,0x69,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvtph2ibs256(<16 x half> %__B, <16 x i16> zeroinitializer, i16 %__A, i32 4) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_ipcvtph_epi8_round(<16 x half> noundef %__A) { +; CHECK-LABEL: test_mm256_ipcvtph_epi8_round: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtph2ibs {rz-sae}, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x78,0x78,0x69,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvtph2ibs256(<16 x half> %__A, <16 x i16> zeroinitializer, i16 -1, i32 11) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvtph_epi8_round(<4 x i64> noundef %__S, i16 noundef zeroext 
%__A, <16 x half> noundef %__B) { +; X64-LABEL: test_mm256_mask_ipcvtph_epi8_round: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2ibs {rz-sae}, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x78,0x79,0x69,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvtph_epi8_round: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2ibs {rz-sae}, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x78,0x79,0x69,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <16 x i16> + %1 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvtph2ibs256(<16 x half> %__B, <16 x i16> %0, i16 %__A, i32 11) + %2 = bitcast <16 x i16> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define dso_local <4 x i64> @test_mm256_maskz_ipcvtph_epi8_round(i16 noundef zeroext %__A, <16 x half> noundef %__B) { +; X64-LABEL: test_mm256_maskz_ipcvtph_epi8_round: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2ibs {rz-sae}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x78,0xf9,0x69,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvtph_epi8_round: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2ibs {rz-sae}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x78,0xf9,0x69,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvtph2ibs256(<16 x half> %__B, <16 x i16> zeroinitializer, i16 %__A, i32 11) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +declare <16 x i16> @llvm.x86.avx10.mask.vcvtph2ibs256(<16 x half>, <16 x i16>, i16, i32) + +define dso_local <2 x i64> @test_mm_ipcvtph_epu8(<8 x half> noundef %__A) { +; CHECK-LABEL: test_mm_ipcvtph_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtph2iubs 
%xmm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x08,0x6b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <8 x i16> @llvm.x86.avx10.mask.vcvtph2iubs128(<8 x half> %__A, <8 x i16> zeroinitializer, i8 -1) + %1 = bitcast <8 x i16> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define dso_local <2 x i64> @test_mm_mask_ipcvtph_epu8(<2 x i64> noundef %__S, i8 noundef zeroext %__A, <8 x half> noundef %__B) { +; X64-LABEL: test_mm_mask_ipcvtph_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2iubs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x09,0x6b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_ipcvtph_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2iubs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x09,0x6b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <2 x i64> %__S to <8 x i16> + %1 = tail call <8 x i16> @llvm.x86.avx10.mask.vcvtph2iubs128(<8 x half> %__B, <8 x i16> %0, i8 %__A) + %2 = bitcast <8 x i16> %1 to <2 x i64> + ret <2 x i64> %2 +} + +declare <8 x i16> @llvm.x86.avx10.mask.vcvtph2iubs128(<8 x half>, <8 x i16>, i8) + +define dso_local <2 x i64> @test_mm_maskz_ipcvtph_epu8(i8 noundef zeroext %__A, <8 x half> noundef %__B) { +; X64-LABEL: test_mm_maskz_ipcvtph_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2iubs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0x89,0x6b,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_ipcvtph_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2iubs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0x89,0x6b,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x i16> @llvm.x86.avx10.mask.vcvtph2iubs128(<8 x half> 
%__B, <8 x i16> zeroinitializer, i8 %__A) + %1 = bitcast <8 x i16> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_ipcvtph_epu8(<16 x half> noundef %__A) local_unnamed_addr #2 { +; CHECK-LABEL: test_mm256_ipcvtph_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtph2iubs %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7c,0x28,0x6b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvtph2iubs256(<16 x half> %__A, <16 x i16> zeroinitializer, i16 -1, i32 4) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvtph_epu8(<4 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x half> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_mask_ipcvtph_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2iubs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x29,0x6b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvtph_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2iubs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x29,0x6b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <16 x i16> + %1 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvtph2iubs256(<16 x half> %__B, <16 x i16> %0, i16 %__A, i32 4) + %2 = bitcast <16 x i16> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define dso_local <4 x i64> @test_mm256_maskz_ipcvtph_epu8(i16 noundef zeroext %__A, <16 x half> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_maskz_ipcvtph_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2iubs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xa9,0x6b,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvtph_epu8: +; X86: # 
%bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2iubs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xa9,0x6b,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvtph2iubs256(<16 x half> %__B, <16 x i16> zeroinitializer, i16 %__A, i32 4) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_ipcvtph_epu8_round(<16 x half> noundef %__A) { +; CHECK-LABEL: test_mm256_ipcvtph_epu8_round: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtph2iubs {rz-sae}, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x78,0x78,0x6b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvtph2iubs256(<16 x half> %__A, <16 x i16> zeroinitializer, i16 -1, i32 11) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvtph_epu8_round(<4 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x half> noundef %__B) { +; X64-LABEL: test_mm256_mask_ipcvtph_epu8_round: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2iubs {rz-sae}, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x78,0x79,0x6b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvtph_epu8_round: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2iubs {rz-sae}, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x78,0x79,0x6b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <16 x i16> + %1 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvtph2iubs256(<16 x half> %__B, <16 x i16> %0, i16 %__A, i32 11) + %2 = bitcast <16 x i16> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define dso_local <4 x i64> @test_mm256_maskz_ipcvtph_epu8_round(i16 noundef zeroext %__A, <16 x half> noundef %__B) 
{ +; X64-LABEL: test_mm256_maskz_ipcvtph_epu8_round: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtph2iubs {rz-sae}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x78,0xf9,0x6b,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvtph_epu8_round: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtph2iubs {rz-sae}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x78,0xf9,0x6b,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvtph2iubs256(<16 x half> %__B, <16 x i16> zeroinitializer, i16 %__A, i32 11) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +declare <16 x i16> @llvm.x86.avx10.mask.vcvtph2iubs256(<16 x half>, <16 x i16>, i16, i32) + +define dso_local <2 x i64> @test_mm_ipcvtps_epi8(<4 x float> noundef %__A) { +; CHECK-LABEL: test_mm_ipcvtps_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtps2ibs %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x69,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <4 x i32> @llvm.x86.avx10.mask.vcvtps2ibs128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) + %1 = bitcast <4 x i32> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define dso_local <2 x i64> @test_mm_mask_ipcvtps_epi8(<2 x i64> noundef %__S, i8 noundef zeroext %__A, <4 x float> noundef %__B) { +; X64-LABEL: test_mm_mask_ipcvtps_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2ibs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x69,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_ipcvtps_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2ibs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x69,0xc1] +; X86-NEXT: retl # encoding: 
[0xc3] +entry: + %0 = bitcast <2 x i64> %__S to <4 x i32> + %1 = tail call <4 x i32> @llvm.x86.avx10.mask.vcvtps2ibs128(<4 x float> %__B, <4 x i32> %0, i8 %__A) + %2 = bitcast <4 x i32> %1 to <2 x i64> + ret <2 x i64> %2 +} + +define dso_local <2 x i64> @test_mm_maskz_ipcvtps_epi8(i8 noundef zeroext %__A, <4 x float> noundef %__B) { +; X64-LABEL: test_mm_maskz_ipcvtps_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2ibs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x69,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_ipcvtps_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2ibs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x69,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <4 x i32> @llvm.x86.avx10.mask.vcvtps2ibs128(<4 x float> %__B, <4 x i32> zeroinitializer, i8 %__A) + %1 = bitcast <4 x i32> %0 to <2 x i64> + ret <2 x i64> %1 +} + +declare <4 x i32> @llvm.x86.avx10.mask.vcvtps2ibs128(<4 x float>, <4 x i32>, i8) + +define dso_local <4 x i64> @test_mm256_ipcvtps_epi8(<8 x float> noundef %__A) local_unnamed_addr #2 { +; CHECK-LABEL: test_mm256_ipcvtps_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtps2ibs %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x69,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvtps2ibs256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1, i32 4) + %1 = bitcast <8 x i32> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvtps_epi8(<4 x i64> noundef %__S, i8 noundef zeroext %__A, <8 x float> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_mask_ipcvtps_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2ibs %ymm1, %ymm0 {%k1} # encoding: 
[0x62,0xf5,0x7d,0x29,0x69,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvtps_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2ibs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x69,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <8 x i32> + %1 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvtps2ibs256(<8 x float> %__B, <8 x i32> %0, i8 %__A, i32 4) + %2 = bitcast <8 x i32> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define dso_local <4 x i64> @test_mm256_maskz_ipcvtps_epi8(i8 noundef zeroext %__A, <8 x float> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_maskz_ipcvtps_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2ibs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x69,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvtps_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2ibs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x69,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvtps2ibs256(<8 x float> %__B, <8 x i32> zeroinitializer, i8 %__A, i32 4) + %1 = bitcast <8 x i32> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_ipcvtps_epi8_round(<8 x float> noundef %__A) { +; CHECK-LABEL: test_mm256_ipcvtps_epi8_round: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtps2ibs {rz-sae}, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x79,0x78,0x69,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvtps2ibs256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1, i32 11) + %1 = bitcast <8 x i32> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> 
@test_mm256_mask_ipcvtps_epi8_round(<4 x i64> noundef %__S, i8 noundef zeroext %__A, <8 x float> noundef %__B) { +; X64-LABEL: test_mm256_mask_ipcvtps_epi8_round: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2ibs {rz-sae}, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x79,0x79,0x69,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvtps_epi8_round: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2ibs {rz-sae}, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x79,0x79,0x69,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <8 x i32> + %1 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvtps2ibs256(<8 x float> %__B, <8 x i32> %0, i8 %__A, i32 11) + %2 = bitcast <8 x i32> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define dso_local <4 x i64> @test_mm256_maskz_ipcvtps_epi8_round(i8 noundef zeroext %__A, <8 x float> noundef %__B) { +; X64-LABEL: test_mm256_maskz_ipcvtps_epi8_round: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2ibs {rz-sae}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x79,0xf9,0x69,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvtps_epi8_round: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2ibs {rz-sae}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x79,0xf9,0x69,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvtps2ibs256(<8 x float> %__B, <8 x i32> zeroinitializer, i8 %__A, i32 11) + %1 = bitcast <8 x i32> %0 to <4 x i64> + ret <4 x i64> %1 +} + +declare <8 x i32> @llvm.x86.avx10.mask.vcvtps2ibs256(<8 x float>, <8 x i32>, i8, i32) + +define dso_local <2 x i64> @test_mm_ipcvtps_epu8(<4 x float> noundef %__A) { +; CHECK-LABEL: 
test_mm_ipcvtps_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtps2iubs %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x6b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <4 x i32> @llvm.x86.avx10.mask.vcvtps2iubs128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) + %1 = bitcast <4 x i32> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define dso_local <2 x i64> @test_mm_mask_ipcvtps_epu8(<2 x i64> noundef %__S, i8 noundef zeroext %__A, <4 x float> noundef %__B) { +; X64-LABEL: test_mm_mask_ipcvtps_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2iubs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x6b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_ipcvtps_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2iubs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x6b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <2 x i64> %__S to <4 x i32> + %1 = tail call <4 x i32> @llvm.x86.avx10.mask.vcvtps2iubs128(<4 x float> %__B, <4 x i32> %0, i8 %__A) + %2 = bitcast <4 x i32> %1 to <2 x i64> + ret <2 x i64> %2 +} + +define dso_local <2 x i64> @test_mm_maskz_ipcvtps_epu8(i8 noundef zeroext %__A, <4 x float> noundef %__B) { +; X64-LABEL: test_mm_maskz_ipcvtps_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2iubs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x6b,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_ipcvtps_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2iubs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x6b,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <4 x i32> @llvm.x86.avx10.mask.vcvtps2iubs128(<4 x float> %__B, 
<4 x i32> zeroinitializer, i8 %__A) + %1 = bitcast <4 x i32> %0 to <2 x i64> + ret <2 x i64> %1 +} + +declare <4 x i32> @llvm.x86.avx10.mask.vcvtps2iubs128(<4 x float>, <4 x i32>, i8) + +define dso_local <4 x i64> @test_mm256_ipcvtps_epu8(<8 x float> noundef %__A) local_unnamed_addr #2 { +; CHECK-LABEL: test_mm256_ipcvtps_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtps2iubs %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x6b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvtps2iubs256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1, i32 4) + %1 = bitcast <8 x i32> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvtps_epu8(<4 x i64> noundef %__S, i8 noundef zeroext %__A, <8 x float> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_mask_ipcvtps_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2iubs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x6b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvtps_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2iubs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x6b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <8 x i32> + %1 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvtps2iubs256(<8 x float> %__B, <8 x i32> %0, i8 %__A, i32 4) + %2 = bitcast <8 x i32> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define dso_local <4 x i64> @test_mm256_maskz_ipcvtps_epu8(i8 noundef zeroext %__A, <8 x float> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_maskz_ipcvtps_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2iubs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x6b,0xc0] +; X64-NEXT: retq # encoding: 
[0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvtps_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2iubs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x6b,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvtps2iubs256(<8 x float> %__B, <8 x i32> zeroinitializer, i8 %__A, i32 4) + %1 = bitcast <8 x i32> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_ipcvtps_epu8_round(<8 x float> noundef %__A) { +; CHECK-LABEL: test_mm256_ipcvtps_epu8_round: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvtps2iubs {rz-sae}, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x79,0x78,0x6b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvtps2iubs256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1, i32 11) + %1 = bitcast <8 x i32> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvtps_epu8_round(<4 x i64> noundef %__S, i8 noundef zeroext %__A, <8 x float> noundef %__B) { +; X64-LABEL: test_mm256_mask_ipcvtps_epu8_round: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2iubs {rz-sae}, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x79,0x79,0x6b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvtps_epu8_round: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2iubs {rz-sae}, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x79,0x79,0x6b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <8 x i32> + %1 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvtps2iubs256(<8 x float> %__B, <8 x i32> %0, i8 %__A, i32 11) + %2 = bitcast <8 x i32> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define dso_local <4 x i64> 
@test_mm256_maskz_ipcvtps_epu8_round(i8 noundef zeroext %__A, <8 x float> noundef %__B) { +; X64-LABEL: test_mm256_maskz_ipcvtps_epu8_round: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvtps2iubs {rz-sae}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x79,0xf9,0x6b,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvtps_epu8_round: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvtps2iubs {rz-sae}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x79,0xf9,0x6b,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvtps2iubs256(<8 x float> %__B, <8 x i32> zeroinitializer, i8 %__A, i32 11) + %1 = bitcast <8 x i32> %0 to <4 x i64> + ret <4 x i64> %1 +} + +declare <8 x i32> @llvm.x86.avx10.mask.vcvtps2iubs256(<8 x float>, <8 x i32>, i8, i32) + +define dso_local <2 x i64> @test_mm_ipcvttnebf16_epi8(<8 x bfloat> noundef %__A) { +; CHECK-LABEL: test_mm_ipcvttnebf16_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttnebf162ibs %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7f,0x08,0x68,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <8 x i16> @llvm.x86.avx10.vcvttnebf162ibs128(<8 x bfloat> %__A) + %1 = bitcast <8 x i16> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define dso_local <2 x i64> @test_mm_mask_ipcvttnebf16_epi8(<2 x i64> noundef %__S, i8 noundef zeroext %__A, <8 x bfloat> noundef %__B) { +; X64-LABEL: test_mm_mask_ipcvttnebf16_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttnebf162ibs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x09,0x68,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_ipcvttnebf16_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttnebf162ibs 
%xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x09,0x68,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <2 x i64> %__S to <8 x i16> + %1 = tail call <8 x i16> @llvm.x86.avx10.vcvttnebf162ibs128(<8 x bfloat> %__B) + %2 = bitcast i8 %__A to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %0 + %4 = bitcast <8 x i16> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <8 x i16> @llvm.x86.avx10.vcvttnebf162ibs128(<8 x bfloat>) + +define dso_local <2 x i64> @test_mm_maskz_ipcvttnebf16_epi8(i8 noundef zeroext %__A, <8 x bfloat> noundef %__B) { +; X64-LABEL: test_mm_maskz_ipcvttnebf16_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttnebf162ibs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x68,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_ipcvttnebf16_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttnebf162ibs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x68,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x i16> @llvm.x86.avx10.vcvttnebf162ibs128(<8 x bfloat> %__B) + %1 = bitcast i8 %__A to <8 x i1> + %2 = select <8 x i1> %1, <8 x i16> %0, <8 x i16> zeroinitializer + %3 = bitcast <8 x i16> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define dso_local <4 x i64> @test_mm256_ipcvttnebf16_epi8(<16 x bfloat> noundef %__A) local_unnamed_addr #2 { +; CHECK-LABEL: test_mm256_ipcvttnebf16_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttnebf162ibs %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7f,0x28,0x68,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.vcvttnebf162ibs256(<16 x bfloat> %__A) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvttnebf16_epi8(<4 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x bfloat> 
noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_mask_ipcvttnebf16_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttnebf162ibs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x29,0x68,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvttnebf16_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttnebf162ibs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x29,0x68,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <16 x i16> + %1 = tail call <16 x i16> @llvm.x86.avx10.vcvttnebf162ibs256(<16 x bfloat> %__B) + %2 = bitcast i16 %__A to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %0 + %4 = bitcast <16 x i16> %3 to <4 x i64> + ret <4 x i64> %4 +} + +define dso_local <4 x i64> @test_mm256_maskz_ipcvttnebf16_epi8(i16 noundef zeroext %__A, <16 x bfloat> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_maskz_ipcvttnebf16_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttnebf162ibs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x68,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvttnebf16_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttnebf162ibs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x68,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.vcvttnebf162ibs256(<16 x bfloat> %__B) + %1 = bitcast i16 %__A to <16 x i1> + %2 = select <16 x i1> %1, <16 x i16> %0, <16 x i16> zeroinitializer + %3 = bitcast <16 x i16> %2 to <4 x i64> + ret <4 x i64> %3 +} + +declare <16 x i16> @llvm.x86.avx10.vcvttnebf162ibs256(<16 x bfloat>) + +define dso_local <2 x i64> @test_mm_ipcvttnebf16_epu8(<8 x 
bfloat> noundef %__A) { +; CHECK-LABEL: test_mm_ipcvttnebf16_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttnebf162iubs %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7f,0x08,0x6a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <8 x i16> @llvm.x86.avx10.vcvttnebf162iubs128(<8 x bfloat> %__A) + %1 = bitcast <8 x i16> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define dso_local <2 x i64> @test_mm_mask_ipcvttnebf16_epu8(<2 x i64> noundef %__S, i8 noundef zeroext %__A, <8 x bfloat> noundef %__B) { +; X64-LABEL: test_mm_mask_ipcvttnebf16_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttnebf162iubs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x09,0x6a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_ipcvttnebf16_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttnebf162iubs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x09,0x6a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <2 x i64> %__S to <8 x i16> + %1 = tail call <8 x i16> @llvm.x86.avx10.vcvttnebf162iubs128(<8 x bfloat> %__B) + %2 = bitcast i8 %__A to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %0 + %4 = bitcast <8 x i16> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <8 x i16> @llvm.x86.avx10.vcvttnebf162iubs128(<8 x bfloat>) + +define dso_local <2 x i64> @test_mm_maskz_ipcvttnebf16_epu8(i8 noundef zeroext %__A, <8 x bfloat> noundef %__B) { +; X64-LABEL: test_mm_maskz_ipcvttnebf16_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttnebf162iubs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x6a,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_ipcvttnebf16_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: 
vcvttnebf162iubs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0x89,0x6a,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x i16> @llvm.x86.avx10.vcvttnebf162iubs128(<8 x bfloat> %__B) + %1 = bitcast i8 %__A to <8 x i1> + %2 = select <8 x i1> %1, <8 x i16> %0, <8 x i16> zeroinitializer + %3 = bitcast <8 x i16> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define dso_local <4 x i64> @test_mm256_ipcvttnebf16_epu8(<16 x bfloat> noundef %__A) local_unnamed_addr #2 { +; CHECK-LABEL: test_mm256_ipcvttnebf16_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttnebf162iubs %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7f,0x28,0x6a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.vcvttnebf162iubs256(<16 x bfloat> %__A) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvttnebf16_epu8(<4 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x bfloat> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_mask_ipcvttnebf16_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttnebf162iubs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x29,0x6a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvttnebf16_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttnebf162iubs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7f,0x29,0x6a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <16 x i16> + %1 = tail call <16 x i16> @llvm.x86.avx10.vcvttnebf162iubs256(<16 x bfloat> %__B) + %2 = bitcast i16 %__A to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %0 + %4 = bitcast <16 x i16> %3 to <4 x i64> + ret <4 x i64> %4 +} + +define dso_local <4 x i64> @test_mm256_maskz_ipcvttnebf16_epu8(i16 noundef zeroext %__A, <16 x bfloat> noundef %__B) 
local_unnamed_addr #2 { +; X64-LABEL: test_mm256_maskz_ipcvttnebf16_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttnebf162iubs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x6a,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvttnebf16_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttnebf162iubs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7f,0xa9,0x6a,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.vcvttnebf162iubs256(<16 x bfloat> %__B) + %1 = bitcast i16 %__A to <16 x i1> + %2 = select <16 x i1> %1, <16 x i16> %0, <16 x i16> zeroinitializer + %3 = bitcast <16 x i16> %2 to <4 x i64> + ret <4 x i64> %3 +} + +declare <16 x i16> @llvm.x86.avx10.vcvttnebf162iubs256(<16 x bfloat>) + +define dso_local <2 x i64> @test_mm_ipcvttph_epi8(<8 x half> noundef %__A) { +; CHECK-LABEL: test_mm_ipcvttph_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttph2ibs %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x08,0x68,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <8 x i16> @llvm.x86.avx10.mask.vcvttph2ibs128(<8 x half> %__A, <8 x i16> zeroinitializer, i8 -1) + %1 = bitcast <8 x i16> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define dso_local <2 x i64> @test_mm_mask_ipcvttph_epi8(<2 x i64> noundef %__S, i8 noundef zeroext %__A, <8 x half> noundef %__B) { +; X64-LABEL: test_mm_mask_ipcvttph_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2ibs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x09,0x68,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_ipcvttph_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2ibs %xmm1, %xmm0 {%k1} # encoding: 
[0x62,0xf5,0x7c,0x09,0x68,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <2 x i64> %__S to <8 x i16> + %1 = tail call <8 x i16> @llvm.x86.avx10.mask.vcvttph2ibs128(<8 x half> %__B, <8 x i16> %0, i8 %__A) + %2 = bitcast <8 x i16> %1 to <2 x i64> + ret <2 x i64> %2 +} + +declare <8 x i16> @llvm.x86.avx10.mask.vcvttph2ibs128(<8 x half>, <8 x i16>, i8) + +define dso_local <2 x i64> @test_mm_maskz_ipcvttph_epi8(i8 noundef zeroext %__A, <8 x half> noundef %__B) { +; X64-LABEL: test_mm_maskz_ipcvttph_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2ibs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0x89,0x68,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_ipcvttph_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2ibs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0x89,0x68,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x i16> @llvm.x86.avx10.mask.vcvttph2ibs128(<8 x half> %__B, <8 x i16> zeroinitializer, i8 %__A) + %1 = bitcast <8 x i16> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_ipcvttph_epi8(<16 x half> noundef %__A) local_unnamed_addr #2 { +; CHECK-LABEL: test_mm256_ipcvttph_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttph2ibs %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7c,0x28,0x68,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvttph2ibs256(<16 x half> %__A, <16 x i16> zeroinitializer, i16 -1, i32 4) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvttph_epi8(<4 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x half> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_mask_ipcvttph_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: 
[0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2ibs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x29,0x68,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvttph_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2ibs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x29,0x68,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <16 x i16> + %1 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvttph2ibs256(<16 x half> %__B, <16 x i16> %0, i16 %__A, i32 4) + %2 = bitcast <16 x i16> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define dso_local <4 x i64> @test_mm256_maskz_ipcvttph_epi8(i16 noundef zeroext %__A, <16 x half> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_maskz_ipcvttph_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2ibs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xa9,0x68,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvttph_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2ibs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xa9,0x68,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvttph2ibs256(<16 x half> %__B, <16 x i16> zeroinitializer, i16 %__A, i32 4) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_ipcvttph_epi8_round(<16 x half> noundef %__A) { +; CHECK-LABEL: test_mm256_ipcvttph_epi8_round: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttph2ibs {sae}, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x78,0x18,0x68,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvttph2ibs256(<16 x half> %__A, <16 x i16> zeroinitializer, i16 -1, i32 8) + %1 = 
bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvttph_epi8_round(<4 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x half> noundef %__B) { +; X64-LABEL: test_mm256_mask_ipcvttph_epi8_round: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2ibs {sae}, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x78,0x19,0x68,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvttph_epi8_round: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2ibs {sae}, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x78,0x19,0x68,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <16 x i16> + %1 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvttph2ibs256(<16 x half> %__B, <16 x i16> %0, i16 %__A, i32 8) + %2 = bitcast <16 x i16> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define dso_local <4 x i64> @test_mm256_maskz_ipcvttph_epi8_round(i16 noundef zeroext %__A, <16 x half> noundef %__B) { +; X64-LABEL: test_mm256_maskz_ipcvttph_epi8_round: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2ibs {sae}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x78,0x99,0x68,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvttph_epi8_round: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2ibs {sae}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x78,0x99,0x68,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvttph2ibs256(<16 x half> %__B, <16 x i16> zeroinitializer, i16 %__A, i32 8) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} +declare <16 x i16> @llvm.x86.avx10.mask.vcvttph2ibs256(<16 x half>, <16 x i16>, i16, i32) + 
+define dso_local <2 x i64> @test_mm_ipcvttph_epu8(<8 x half> noundef %__A) { +; CHECK-LABEL: test_mm_ipcvttph_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttph2iubs %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x08,0x6a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <8 x i16> @llvm.x86.avx10.mask.vcvttph2iubs128(<8 x half> %__A, <8 x i16> zeroinitializer, i8 -1) + %1 = bitcast <8 x i16> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define dso_local <2 x i64> @test_mm_mask_ipcvttph_epu8(<2 x i64> noundef %__S, i8 noundef zeroext %__A, <8 x half> noundef %__B) { +; X64-LABEL: test_mm_mask_ipcvttph_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2iubs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x09,0x6a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_ipcvttph_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2iubs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x09,0x6a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <2 x i64> %__S to <8 x i16> + %1 = tail call <8 x i16> @llvm.x86.avx10.mask.vcvttph2iubs128(<8 x half> %__B, <8 x i16> %0, i8 %__A) + %2 = bitcast <8 x i16> %1 to <2 x i64> + ret <2 x i64> %2 +} + +declare <8 x i16> @llvm.x86.avx10.mask.vcvttph2iubs128(<8 x half>, <8 x i16>, i8) + +define dso_local <2 x i64> @test_mm_maskz_ipcvttph_epu8(i8 noundef zeroext %__A, <8 x half> noundef %__B) { +; X64-LABEL: test_mm_maskz_ipcvttph_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2iubs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0x89,0x6a,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_ipcvttph_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; 
X86-NEXT: vcvttph2iubs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0x89,0x6a,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x i16> @llvm.x86.avx10.mask.vcvttph2iubs128(<8 x half> %__B, <8 x i16> zeroinitializer, i8 %__A) + %1 = bitcast <8 x i16> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_ipcvttph_epu8(<16 x half> noundef %__A) local_unnamed_addr #2 { +; CHECK-LABEL: test_mm256_ipcvttph_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttph2iubs %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7c,0x28,0x6a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvttph2iubs256(<16 x half> %__A, <16 x i16> zeroinitializer, i16 -1, i32 4) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvttph_epu8(<4 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x half> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_mask_ipcvttph_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2iubs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x29,0x6a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvttph_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2iubs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7c,0x29,0x6a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <16 x i16> + %1 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvttph2iubs256(<16 x half> %__B, <16 x i16> %0, i16 %__A, i32 4) + %2 = bitcast <16 x i16> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define dso_local <4 x i64> @test_mm256_maskz_ipcvttph_epu8(i16 noundef zeroext %__A, <16 x half> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_maskz_ipcvttph_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # 
encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2iubs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xa9,0x6a,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvttph_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2iubs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7c,0xa9,0x6a,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvttph2iubs256(<16 x half> %__B, <16 x i16> zeroinitializer, i16 %__A, i32 4) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_ipcvttph_epu8_round(<16 x half> noundef %__A) { +; CHECK-LABEL: test_mm256_ipcvttph_epu8_round: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttph2iubs {sae}, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x78,0x18,0x6a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvttph2iubs256(<16 x half> %__A, <16 x i16> zeroinitializer, i16 -1, i32 8) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvttph_epu8_round(<4 x i64> noundef %__S, i16 noundef zeroext %__A, <16 x half> noundef %__B) { +; X64-LABEL: test_mm256_mask_ipcvttph_epu8_round: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2iubs {sae}, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x78,0x19,0x6a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvttph_epu8_round: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2iubs {sae}, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x78,0x19,0x6a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <16 x i16> + %1 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvttph2iubs256(<16 x half> 
%__B, <16 x i16> %0, i16 %__A, i32 8) + %2 = bitcast <16 x i16> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define dso_local <4 x i64> @test_mm256_maskz_ipcvttph_epu8_round(i16 noundef zeroext %__A, <16 x half> noundef %__B) { +; X64-LABEL: test_mm256_maskz_ipcvttph_epu8_round: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttph2iubs {sae}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x78,0x99,0x6a,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvttph_epu8_round: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttph2iubs {sae}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x78,0x99,0x6a,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <16 x i16> @llvm.x86.avx10.mask.vcvttph2iubs256(<16 x half> %__B, <16 x i16> zeroinitializer, i16 %__A, i32 8) + %1 = bitcast <16 x i16> %0 to <4 x i64> + ret <4 x i64> %1 +} + +declare <16 x i16> @llvm.x86.avx10.mask.vcvttph2iubs256(<16 x half>, <16 x i16>, i16, i32) + +define dso_local <2 x i64> @test_mm_ipcvttps_epi8(<4 x float> noundef %__A) { +; CHECK-LABEL: test_mm_ipcvttps_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttps2ibs %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x68,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <4 x i32> @llvm.x86.avx10.mask.vcvttps2ibs128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) + %1 = bitcast <4 x i32> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define dso_local <2 x i64> @test_mm_mask_ipcvttps_epi8(<2 x i64> noundef %__S, i8 noundef zeroext %__A, <4 x float> noundef %__B) { +; X64-LABEL: test_mm_mask_ipcvttps_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2ibs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x68,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_ipcvttps_epi8: 
+; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2ibs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x68,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <2 x i64> %__S to <4 x i32> + %1 = tail call <4 x i32> @llvm.x86.avx10.mask.vcvttps2ibs128(<4 x float> %__B, <4 x i32> %0, i8 %__A) + %2 = bitcast <4 x i32> %1 to <2 x i64> + ret <2 x i64> %2 +} + +define dso_local <2 x i64> @test_mm_maskz_ipcvttps_epi8(i8 noundef zeroext %__A, <4 x float> noundef %__B) { +; X64-LABEL: test_mm_maskz_ipcvttps_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2ibs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x68,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_ipcvttps_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2ibs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x68,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <4 x i32> @llvm.x86.avx10.mask.vcvttps2ibs128(<4 x float> %__B, <4 x i32> zeroinitializer, i8 %__A) + %1 = bitcast <4 x i32> %0 to <2 x i64> + ret <2 x i64> %1 +} + +declare <4 x i32> @llvm.x86.avx10.mask.vcvttps2ibs128(<4 x float>, <4 x i32>, i8) + +define dso_local <4 x i64> @test_mm256_ipcvttps_epi8(<8 x float> noundef %__A) local_unnamed_addr #2 { +; CHECK-LABEL: test_mm256_ipcvttps_epi8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttps2ibs %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x68,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvttps2ibs256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1, i32 4) + %1 = bitcast <8 x i32> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvttps_epi8(<4 x i64> noundef %__S, i8 noundef zeroext %__A, <8 
x float> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_mask_ipcvttps_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2ibs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x68,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvttps_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2ibs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x68,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <8 x i32> + %1 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvttps2ibs256(<8 x float> %__B, <8 x i32> %0, i8 %__A, i32 4) + %2 = bitcast <8 x i32> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define dso_local <4 x i64> @test_mm256_maskz_ipcvttps_epi8(i8 noundef zeroext %__A, <8 x float> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_maskz_ipcvttps_epi8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2ibs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x68,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvttps_epi8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2ibs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x68,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvttps2ibs256(<8 x float> %__B, <8 x i32> zeroinitializer, i8 %__A, i32 4) + %1 = bitcast <8 x i32> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_ipcvttps_epi8_round(<8 x float> noundef %__A) { +; CHECK-LABEL: test_mm256_ipcvttps_epi8_round: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttps2ibs {sae}, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x79,0x18,0x68,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: 
[0xc3] +entry: + %0 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvttps2ibs256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1, i32 8) + %1 = bitcast <8 x i32> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvttps_epi8_round(<4 x i64> noundef %__S, i8 noundef zeroext %__A, <8 x float> noundef %__B) { +; X64-LABEL: test_mm256_mask_ipcvttps_epi8_round: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2ibs {sae}, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x79,0x19,0x68,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvttps_epi8_round: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2ibs {sae}, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x79,0x19,0x68,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <8 x i32> + %1 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvttps2ibs256(<8 x float> %__B, <8 x i32> %0, i8 %__A, i32 8) + %2 = bitcast <8 x i32> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define dso_local <4 x i64> @test_mm256_maskz_ipcvttps_epi8_round(i8 noundef zeroext %__A, <8 x float> noundef %__B) { +; X64-LABEL: test_mm256_maskz_ipcvttps_epi8_round: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2ibs {sae}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x79,0x99,0x68,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvttps_epi8_round: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2ibs {sae}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x79,0x99,0x68,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvttps2ibs256(<8 x float> %__B, <8 x i32> zeroinitializer, i8 %__A, i32 8) + %1 = bitcast <8 x i32> %0 
to <4 x i64> + ret <4 x i64> %1 +} + +declare <8 x i32> @llvm.x86.avx10.mask.vcvttps2ibs256(<8 x float>, <8 x i32>, i8, i32) + +define dso_local <2 x i64> @test_mm_ipcvttps_epu8(<4 x float> noundef %__A) { +; CHECK-LABEL: test_mm_ipcvttps_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttps2iubs %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x6a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <4 x i32> @llvm.x86.avx10.mask.vcvttps2iubs128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) + %1 = bitcast <4 x i32> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define dso_local <2 x i64> @test_mm_mask_ipcvttps_epu8(<2 x i64> noundef %__S, i8 noundef zeroext %__A, <4 x float> noundef %__B) { +; X64-LABEL: test_mm_mask_ipcvttps_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2iubs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x6a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_mask_ipcvttps_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2iubs %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x6a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <2 x i64> %__S to <4 x i32> + %1 = tail call <4 x i32> @llvm.x86.avx10.mask.vcvttps2iubs128(<4 x float> %__B, <4 x i32> %0, i8 %__A) + %2 = bitcast <4 x i32> %1 to <2 x i64> + ret <2 x i64> %2 +} + +define dso_local <2 x i64> @test_mm_maskz_ipcvttps_epu8(i8 noundef zeroext %__A, <4 x float> noundef %__B) { +; X64-LABEL: test_mm_maskz_ipcvttps_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2iubs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x6a,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm_maskz_ipcvttps_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: 
[0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2iubs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0x89,0x6a,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <4 x i32> @llvm.x86.avx10.mask.vcvttps2iubs128(<4 x float> %__B, <4 x i32> zeroinitializer, i8 %__A) + %1 = bitcast <4 x i32> %0 to <2 x i64> + ret <2 x i64> %1 +} + +declare <4 x i32> @llvm.x86.avx10.mask.vcvttps2iubs128(<4 x float>, <4 x i32>, i8) + +define dso_local <4 x i64> @test_mm256_ipcvttps_epu8(<8 x float> noundef %__A) local_unnamed_addr #2 { +; CHECK-LABEL: test_mm256_ipcvttps_epu8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttps2iubs %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x6a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvttps2iubs256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1, i32 4) + %1 = bitcast <8 x i32> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvttps_epu8(<4 x i64> noundef %__S, i8 noundef zeroext %__A, <8 x float> noundef %__B) local_unnamed_addr #2 { +; X64-LABEL: test_mm256_mask_ipcvttps_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2iubs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x6a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvttps_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2iubs %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x6a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = bitcast <4 x i64> %__S to <8 x i32> + %1 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvttps2iubs256(<8 x float> %__B, <8 x i32> %0, i8 %__A, i32 4) + %2 = bitcast <8 x i32> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define dso_local <4 x i64> @test_mm256_maskz_ipcvttps_epu8(i8 noundef zeroext %__A, <8 x float> noundef %__B) 
local_unnamed_addr #2 { +; X64-LABEL: test_mm256_maskz_ipcvttps_epu8: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2iubs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x6a,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvttps_epu8: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2iubs %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x7d,0xa9,0x6a,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvttps2iubs256(<8 x float> %__B, <8 x i32> zeroinitializer, i8 %__A, i32 4) + %1 = bitcast <8 x i32> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_ipcvttps_epu8_round(<8 x float> noundef %__A) { +; CHECK-LABEL: test_mm256_ipcvttps_epu8_round: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vcvttps2iubs {sae}, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x79,0x18,0x6a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +entry: + %0 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvttps2iubs256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1, i32 8) + %1 = bitcast <8 x i32> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define dso_local <4 x i64> @test_mm256_mask_ipcvttps_epu8_round(<4 x i64> noundef %__S, i8 noundef zeroext %__A, <8 x float> noundef %__B) { +; X64-LABEL: test_mm256_mask_ipcvttps_epu8_round: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2iubs {sae}, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x79,0x19,0x6a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_mask_ipcvttps_epu8_round: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2iubs {sae}, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf5,0x79,0x19,0x6a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +entry: + 
%0 = bitcast <4 x i64> %__S to <8 x i32> + %1 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvttps2iubs256(<8 x float> %__B, <8 x i32> %0, i8 %__A, i32 8) + %2 = bitcast <8 x i32> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define dso_local <4 x i64> @test_mm256_maskz_ipcvttps_epu8_round(i8 noundef zeroext %__A, <8 x float> noundef %__B) { +; X64-LABEL: test_mm256_maskz_ipcvttps_epu8_round: +; X64: # %bb.0: # %entry +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vcvttps2iubs {sae}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x79,0x99,0x6a,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mm256_maskz_ipcvttps_epu8_round: +; X86: # %bb.0: # %entry +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vcvttps2iubs {sae}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x79,0x99,0x6a,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +entry: + %0 = tail call <8 x i32> @llvm.x86.avx10.mask.vcvttps2iubs256(<8 x float> %__B, <8 x i32> zeroinitializer, i8 %__A, i32 8) + %1 = bitcast <8 x i32> %0 to <4 x i64> + ret <4 x i64> %1 +} + +declare <8 x i32> @llvm.x86.avx10.mask.vcvttps2iubs256(<8 x float>, <8 x i32>, i8, i32) diff --git a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll index 25d182afd66e..78870278eeac 100644 --- a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll +++ b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll @@ -69,8 +69,12 @@ define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind { ; X64-NEXT: andq $-64, %rsp ; X64-NEXT: subq $128, %rsp ; X64-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; X64-NEXT: pushq %rbp +; X64-NEXT: pushq %rax ; X64-NEXT: movq %rsp, %rdi ; X64-NEXT: callq _func_float16_ptr +; X64-NEXT: addq $8, %rsp +; X64-NEXT: popq %rbp ; X64-NEXT: vaddps (%rsp), %zmm0, %zmm0 ; X64-NEXT: leaq -16(%rbp), %rsp ; X64-NEXT: popq %r12 @@ -149,8 +153,12 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind { ; 
X64-NEXT: subq $128, %rsp ; X64-NEXT: vmovaps %zmm1, %zmm16 ; X64-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; X64-NEXT: pushq %rbp +; X64-NEXT: pushq %rax ; X64-NEXT: movq %rsp, %rdi ; X64-NEXT: callq _func_float16_ptr +; X64-NEXT: addq $8, %rsp +; X64-NEXT: popq %rbp ; X64-NEXT: vaddps %zmm16, %zmm0, %zmm0 ; X64-NEXT: vaddps (%rsp), %zmm0, %zmm0 ; X64-NEXT: leaq -16(%rbp), %rsp diff --git a/llvm/test/CodeGen/X86/clobber_base_ptr.ll b/llvm/test/CodeGen/X86/clobber_base_ptr.ll new file mode 100644 index 000000000000..2c39560f02d1 --- /dev/null +++ b/llvm/test/CodeGen/X86/clobber_base_ptr.ll @@ -0,0 +1,118 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:32-n8:16:32-a:0:32-S32" +target triple = "i386-pc-windows-gnu" + +; This function uses esi as base pointer, the inline asm clobbers esi, so we +; should save esi using esp before the inline asm, and restore esi after the +; inline asm. 
+ +define i32 @clober_bp() { +; CHECK-LABEL: clober_bp: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .cfi_offset %ebp, -8 +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: .cfi_def_cfa_register %ebp +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: andl $-16, %esp +; CHECK-NEXT: subl $16, %esp +; CHECK-NEXT: movl %esp, %esi +; CHECK-NEXT: .cfi_offset %esi, -16 +; CHECK-NEXT: .cfi_offset %edi, -12 +; CHECK-NEXT: movl $4, 12(%esi) +; CHECK-NEXT: movl 12(%esi), %eax +; CHECK-NEXT: addl $3, %eax +; CHECK-NEXT: andl $-4, %eax +; CHECK-NEXT: calll __alloca +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: andl $-16, %eax +; CHECK-NEXT: movl %eax, %esp +; CHECK-NEXT: movl $1, (%eax) +; CHECK-NEXT: leal 8(%esi), %edi +; CHECK-NEXT: movl $4, %ecx +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: #APP +; CHECK-NEXT: rep movsb (%esi), %es:(%edi) +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: popl %esi +; CHECK-NEXT: movl 8(%esi), %eax +; CHECK-NEXT: leal -8(%ebp), %esp +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: retl +entry: + %size = alloca i32, align 4 + %g = alloca i32, align 4 + store volatile i32 4, ptr %size, align 4 + %len = load volatile i32, ptr %size, align 4 + %var_array = alloca i8, i32 %len, align 16 + store i32 1, ptr %var_array, align 16 + %nil = call { ptr, ptr, i32 } asm "rep movsb", "={di},={si},={cx},0,1,2,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr %g, ptr %var_array, i32 4) + %retval = load i32, ptr %g, align 4 + ret i32 %retval +} + +; This function has the same code, except that the inline asm also clobbers +; the frame pointer (ebp). 
+ +define i32 @clobber_bpfp() { +; CHECK-LABEL: clobber_bpfp: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .cfi_offset %ebp, -8 +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: .cfi_def_cfa_register %ebp +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: andl $-16, %esp +; CHECK-NEXT: subl $16, %esp +; CHECK-NEXT: movl %esp, %esi +; CHECK-NEXT: .cfi_offset %esi, -16 +; CHECK-NEXT: .cfi_offset %edi, -12 +; CHECK-NEXT: movl $4, 12(%esi) +; CHECK-NEXT: movl 12(%esi), %eax +; CHECK-NEXT: addl $3, %eax +; CHECK-NEXT: andl $-4, %eax +; CHECK-NEXT: calll __alloca +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: andl $-16, %eax +; CHECK-NEXT: movl %eax, %esp +; CHECK-NEXT: movl $1, (%eax) +; CHECK-NEXT: leal 8(%esi), %edi +; CHECK-NEXT: movl $4, %ecx +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: .cfi_remember_state +; CHECK-NEXT: .cfi_escape 0x0f, 0x06, 0x74, 0x04, 0x06, 0x11, 0x08, 0x22 # +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: #APP +; CHECK-NEXT: rep movsb (%esi), %es:(%edi) +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: .cfi_restore_state +; CHECK-NEXT: movl 8(%esi), %eax +; CHECK-NEXT: leal -8(%ebp), %esp +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: retl +entry: + %size = alloca i32, align 4 + %g = alloca i32, align 4 + store volatile i32 4, ptr %size, align 4 + %len = load volatile i32, ptr %size, align 4 + %var_array = alloca i8, i32 %len, align 16 + store i32 1, ptr %var_array, align 16 + %nil = call { ptr, ptr, i32 } asm "rep movsb", "={di},={si},={cx},0,1,2,~{memory},~{dirflag},~{fpsr},~{flags},~{ebp}"(ptr %g, ptr %var_array, i32 4) + %retval = load i32, ptr %g, align 4 + ret i32 %retval +} + diff --git a/llvm/test/CodeGen/X86/clobber_frame_ptr.ll b/llvm/test/CodeGen/X86/clobber_frame_ptr.ll new file mode 100644 index 000000000000..6209e1a85e9e --- /dev/null +++ 
b/llvm/test/CodeGen/X86/clobber_frame_ptr.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=x86_64-pc-linux -stackrealign -verify-machineinstrs < %s | FileCheck %s + +; The ghccc calling convention uses ebp to pass a parameter, so calling a +; function using ghccc clobbers ebp. We should save and restore ebp around such +; a call if ebp is used as the frame pointer. + +declare ghccc i32 @external(i32) + +; Basic test with the ghccc calling convention. +define i32 @test1(i32 %0, i32 %1) { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: andq $-16, %rsp +; CHECK-NEXT: subq $16, %rsp +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_remember_state +; CHECK-NEXT: .cfi_escape 0x0f, 0x06, 0x77, 0x08, 0x06, 0x11, 0x10, 0x22 # +; CHECK-NEXT: movl %esi, %ebp +; CHECK-NEXT: movq %rdi, %r13 +; CHECK-NEXT: callq external@PLT +; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_restore_state +; CHECK-NEXT: leaq -40(%rbp), %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + %x = call ghccc i32 @external(i32 %0, i32 %1) + ret i32 %x +} + +; The hipe calling convention has similar behavior. It clobbers rbp but not rbx. 
+ +declare cc 11 i64 @hipe1(i64) +declare cc 11 i64 @hipe2(i64, i64, i64, i64, i64, i64, i64) + +; Basic test with the hipe calling convention. +define i64 @test2(i64 %a0, i64 %a1) { +; CHECK-LABEL: test2: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: andq $-16, %rsp +; CHECK-NEXT: subq $16, %rsp +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_remember_state +; CHECK-NEXT: .cfi_escape 0x0f, 0x06, 0x77, 0x08, 0x06, 0x11, 0x10, 0x22 # +; CHECK-NEXT: movq %rsi, %rbp +; CHECK-NEXT: movq %rdi, %r15 +; CHECK-NEXT: callq hipe1@PLT +; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_restore_state +; CHECK-NEXT: movq %r15, %rax +; CHECK-NEXT: leaq -40(%rbp), %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + %x = call cc 11 i64 @hipe1(i64 %a0, i64 %a1) + ret i64 %x +} + +; Test with more arguments, so that some of them are passed on the stack. The +; spilling of rbp should not disturb the stack arguments. +; FIXME: The currently generated code is wrong: rbp is used to load incoming +; stack arguments after it has already been assigned an outgoing call argument. +; This is caused by x86-cf-opt. 
+define i64 @test3(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7) { +; CHECK-LABEL: test3: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: andq $-16, %rsp +; CHECK-NEXT: subq $16, %rsp +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_remember_state +; CHECK-NEXT: .cfi_escape 0x0f, 0x06, 0x77, 0x08, 0x06, 0x11, 0x10, 0x22 # +; CHECK-NEXT: movq %rsi, %rbp +; CHECK-NEXT: movq %rdi, %r15 +; CHECK-NEXT: movq %rdx, %rsi +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: movq %r8, %rcx +; CHECK-NEXT: movq %r9, %r8 +; CHECK-NEXT: pushq 24(%rbp) +; CHECK-NEXT: pushq 16(%rbp) +; CHECK-NEXT: callq hipe2@PLT +; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_restore_state +; CHECK-NEXT: addq $16, %rsp +; CHECK-NEXT: movq %r15, %rax +; CHECK-NEXT: leaq -40(%rbp), %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + %x = call cc 11 i64 @hipe2(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7) + ret i64 %x +} diff --git a/llvm/test/CodeGen/X86/clobber_frame_ptr_x32.ll b/llvm/test/CodeGen/X86/clobber_frame_ptr_x32.ll new file mode 100644 index 000000000000..25c951d8b1a1 --- /dev/null +++ b/llvm/test/CodeGen/X86/clobber_frame_ptr_x32.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s | 
FileCheck %s + +target triple = "x86_64-linux-gnux32" + +define i32 @foo() { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: subq $16, %rsp +; CHECK-NEXT: movl $4, -8(%rbp) +; CHECK-NEXT: movl $5, -4(%rbp) +; CHECK-NEXT: movl -8(%rbp), %eax +; CHECK-NEXT: movq %rsp, %rcx +; CHECK-NEXT: addq $15, %rax +; CHECK-NEXT: andq $-16, %rax +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: subq %rax, %rdx +; CHECK-NEXT: movq %rdx, %rsp +; CHECK-NEXT: negq %rax +; CHECK-NEXT: movl $1, (%rcx,%rax) +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_remember_state +; CHECK-NEXT: .cfi_escape 0x0f, 0x06, 0x77, 0x08, 0x06, 0x11, 0x10, 0x22 # +; CHECK-NEXT: movl $123, %ebp +; CHECK-NEXT: #APP +; CHECK-NEXT: nop +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_restore_state +; CHECK-NEXT: movl -4(%rbp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq +entry: + %size = alloca i32, align 4 + %g = alloca i32, align 4 + store volatile i32 4, ptr %size, align 4 + store volatile i32 5, ptr %g, align 4 + %len = load volatile i32, ptr %size, align 4 + %var_array = alloca i8, i32 %len, align 16 + store i32 1, ptr %var_array, align 16 + call void asm "nop", "{ebp},~{memory}"(i32 123) + %retval = load i32, ptr %g, align 4 + ret i32 %retval +} diff --git a/llvm/test/CodeGen/X86/cmp.ll b/llvm/test/CodeGen/X86/cmp.ll index 5a63d36a6be4..0965b1c7208f 100644 --- a/llvm/test/CodeGen/X86/cmp.ll +++ b/llvm/test/CodeGen/X86/cmp.ll @@ -178,7 +178,7 @@ define i32 @test7(i64 %res) nounwind { ; NDD-LABEL: test7: ; NDD: # %bb.0: # %entry ; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NDD-NEXT: shrq $32, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0xc1,0xef,0x20] +; NDD-NEXT: shrq 
$32, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xef,0x20] ; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; NDD-NEXT: retq # encoding: [0xc3] entry: @@ -198,9 +198,9 @@ define i32 @test8(i64 %res) nounwind { ; ; NDD-LABEL: test8: ; NDD: # %bb.0: -; NDD-NEXT: shrq $32, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0xc1,0xef,0x20] +; NDD-NEXT: shrq $32, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xef,0x20] ; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NDD-NEXT: cmpl $3, %ecx # encoding: [0x83,0xf9,0x03] +; NDD-NEXT: cmpl $3, %edi # encoding: [0x83,0xff,0x03] ; NDD-NEXT: setb %al # encoding: [0x0f,0x92,0xc0] ; NDD-NEXT: retq # encoding: [0xc3] %lnot = icmp ult i64 %res, 12884901888 @@ -219,7 +219,7 @@ define i32 @test9(i64 %res) nounwind { ; NDD-LABEL: test9: ; NDD: # %bb.0: ; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NDD-NEXT: shrq $33, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0xc1,0xef,0x21] +; NDD-NEXT: shrq $33, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xef,0x21] ; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; NDD-NEXT: retq # encoding: [0xc3] %lnot = icmp ult i64 %res, 8589934592 @@ -238,7 +238,7 @@ define i32 @test10(i64 %res) nounwind { ; NDD-LABEL: test10: ; NDD: # %bb.0: ; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NDD-NEXT: shrq $32, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0xc1,0xef,0x20] +; NDD-NEXT: shrq $32, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xef,0x20] ; NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0] ; NDD-NEXT: retq # encoding: [0xc3] %lnot = icmp uge i64 %res, 4294967296 @@ -257,9 +257,9 @@ define i32 @test11(i64 %l) nounwind { ; ; NDD-LABEL: test11: ; NDD: # %bb.0: -; NDD-NEXT: shrq $47, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0xc1,0xef,0x2f] +; NDD-NEXT: shrq $47, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xef,0x2f] ; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NDD-NEXT: cmpl $1, %ecx # encoding: [0x83,0xf9,0x01] +; 
NDD-NEXT: cmpl $1, %edi # encoding: [0x83,0xff,0x01] ; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; NDD-NEXT: retq # encoding: [0xc3] %shr.mask = and i64 %l, -140737488355328 @@ -331,7 +331,7 @@ define i32 @test14(i32 %mask, i32 %base, i32 %intra) { ; ; NDD-LABEL: test14: ; NDD: # %bb.0: -; NDD-NEXT: shrl $7, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xef,0x07] +; NDD-NEXT: shrl $7, %edi # EVEX TO LEGACY Compression encoding: [0xc1,0xef,0x07] ; NDD-NEXT: cmovnsl %edx, %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x49,0xf2] ; NDD-NEXT: retq # encoding: [0xc3] %s = lshr i32 %mask, 7 @@ -353,10 +353,10 @@ define zeroext i1 @test15(i32 %bf.load, i32 %n) { ; ; NDD-LABEL: test15: ; NDD: # %bb.0: -; NDD-NEXT: shrl $16, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xef,0x10] -; NDD-NEXT: sete %cl # encoding: [0x0f,0x94,0xc1] -; NDD-NEXT: cmpl %esi, %eax # encoding: [0x39,0xf0] -; NDD-NEXT: setae %al # encoding: [0x0f,0x93,0xc0] +; NDD-NEXT: shrl $16, %edi # EVEX TO LEGACY Compression encoding: [0xc1,0xef,0x10] +; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] +; NDD-NEXT: cmpl %esi, %edi # encoding: [0x39,0xf7] +; NDD-NEXT: setae %cl # encoding: [0x0f,0x93,0xc1] ; NDD-NEXT: orb %cl, %al # EVEX TO LEGACY Compression encoding: [0x08,0xc8] ; NDD-NEXT: retq # encoding: [0xc3] %bf.lshr = lshr i32 %bf.load, 16 @@ -482,7 +482,7 @@ define i32 @highmask_i64_mask64(i64 %val) { ; NDD-LABEL: highmask_i64_mask64: ; NDD: # %bb.0: ; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NDD-NEXT: shrq $41, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0xc1,0xef,0x29] +; NDD-NEXT: shrq $41, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xef,0x29] ; NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0] ; NDD-NEXT: retq # encoding: [0xc3] %and = and i64 %val, -2199023255552 @@ -526,7 +526,7 @@ define i32 @highmask_i64_mask32(i64 %val) { ; NDD-LABEL: highmask_i64_mask32: ; NDD: # %bb.0: ; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NDD-NEXT: shrq $20, %rdi, %rcx # 
encoding: [0x62,0xf4,0xf4,0x18,0xc1,0xef,0x14] +; NDD-NEXT: shrq $20, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xef,0x14] ; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; NDD-NEXT: retq # encoding: [0xc3] %and = and i64 %val, -1048576 @@ -584,7 +584,7 @@ define i32 @lowmask_i64_mask64(i64 %val) { ; NDD-LABEL: lowmask_i64_mask64: ; NDD: # %bb.0: ; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NDD-NEXT: shlq $16, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0xc1,0xe7,0x10] +; NDD-NEXT: shlq $16, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xe7,0x10] ; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; NDD-NEXT: retq # encoding: [0xc3] %and = and i64 %val, 281474976710655 @@ -628,7 +628,7 @@ define i32 @lowmask_i64_mask32(i64 %val) { ; NDD-LABEL: lowmask_i64_mask32: ; NDD: # %bb.0: ; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] -; NDD-NEXT: shlq $44, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0xc1,0xe7,0x2c] +; NDD-NEXT: shlq $44, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xe7,0x2c] ; NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0] ; NDD-NEXT: retq # encoding: [0xc3] %and = and i64 %val, 1048575 @@ -739,8 +739,8 @@ define i1 @shifted_mask64_testb(i64 %a) { ; ; NDD-LABEL: shifted_mask64_testb: ; NDD: # %bb.0: -; NDD-NEXT: shrq $50, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0xef,0x32] -; NDD-NEXT: testb %al, %al # encoding: [0x84,0xc0] +; NDD-NEXT: shrq $50, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xef,0x32] +; NDD-NEXT: testb %dil, %dil # encoding: [0x40,0x84,0xff] ; NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0] ; NDD-NEXT: retq # encoding: [0xc3] %v0 = and i64 %a, 287104476244869120 ; 0xff << 50 @@ -758,8 +758,8 @@ define i1 @shifted_mask64_testw(i64 %a) { ; ; NDD-LABEL: shifted_mask64_testw: ; NDD: # %bb.0: -; NDD-NEXT: shrq $33, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0xef,0x21] -; NDD-NEXT: testw %ax, %ax # encoding: [0x66,0x85,0xc0] +; NDD-NEXT: shrq $33, %rdi # EVEX TO LEGACY 
Compression encoding: [0x48,0xc1,0xef,0x21] +; NDD-NEXT: testw %di, %di # encoding: [0x66,0x85,0xff] ; NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0] ; NDD-NEXT: retq # encoding: [0xc3] %v0 = and i64 %a, 562941363486720 ; 0xffff << 33 @@ -777,8 +777,8 @@ define i1 @shifted_mask64_testl(i64 %a) { ; ; NDD-LABEL: shifted_mask64_testl: ; NDD: # %bb.0: -; NDD-NEXT: shrq $7, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0xef,0x07] -; NDD-NEXT: testl %eax, %eax # encoding: [0x85,0xc0] +; NDD-NEXT: shrq $7, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xef,0x07] +; NDD-NEXT: testl %edi, %edi # encoding: [0x85,0xff] ; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; NDD-NEXT: retq # encoding: [0xc3] %v0 = and i64 %a, 549755813760 ; 0xffffffff << 7 @@ -817,9 +817,9 @@ define i1 @shifted_mask64_extra_use_and(i64 %a) { ; NDD: # %bb.0: ; NDD-NEXT: movabsq $287104476244869120, %rax # encoding: [0x48,0xb8,0x00,0x00,0x00,0x00,0x00,0x00,0xfc,0x03] ; NDD-NEXT: # imm = 0x3FC000000000000 -; NDD-NEXT: andq %rax, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x21,0xc7] +; NDD-NEXT: andq %rax, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x21,0xc7] ; NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0] -; NDD-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A] +; NDD-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A] ; NDD-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; NDD-NEXT: retq # encoding: [0xc3] %v0 = and i64 %a, 287104476244869120 ; 0xff << 50 @@ -868,10 +868,10 @@ define i1 @shifted_mask32_extra_use_and(i64 %a) { ; ; NDD-LABEL: shifted_mask32_extra_use_and: ; NDD: # %bb.0: -; NDD-NEXT: andq $66846720, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xe7,0x00,0x00,0xfc,0x03] +; NDD-NEXT: andq $66846720, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xe7,0x00,0x00,0xfc,0x03] ; NDD-NEXT: # imm = 0x3FC0000 ; NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0] -; NDD-NEXT: movq %rcx, d64(%rip) # encoding: 
[0x48,0x89,0x0d,A,A,A,A] +; NDD-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A] ; NDD-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte ; NDD-NEXT: retq # encoding: [0xc3] %v0 = and i64 %a, 66846720 ; 0xff << 50 diff --git a/llvm/test/CodeGen/X86/combine-srem.ll b/llvm/test/CodeGen/X86/combine-srem.ll index 4ed00a9d66bd..8bfaa6118b79 100644 --- a/llvm/test/CodeGen/X86/combine-srem.ll +++ b/llvm/test/CodeGen/X86/combine-srem.ll @@ -83,7 +83,7 @@ define <4 x i32> @combine_vec_srem_by_minsigned(<4 x i32> %x) { ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_vec_srem_by_minsigned: @@ -93,7 +93,7 @@ define <4 x i32> @combine_vec_srem_by_minsigned(<4 x i32> %x) { ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = srem <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648> ret <4 x i32> %1 @@ -225,24 +225,28 @@ define <4 x i32> @combine_vec_srem_by_pow2a_neg(<4 x i32> %x) { ; SSE-NEXT: psrad $31, %xmm1 ; SSE-NEXT: psrld $30, %xmm1 ; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: psrld $2, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: psubd %xmm1, %xmm2 -; SSE-NEXT: pslld $2, %xmm2 -; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: psubd %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_srem_by_pow2a_neg: -; AVX: # %bb.0: -; AVX-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX-NEXT: vpsrld $30, %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpsrld $2, %xmm1, %xmm1 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: 
vpsubd %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpslld $2, %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_srem_by_pow2a_neg: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $30, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_srem_by_pow2a_neg: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 +; AVX2-NEXT: vpsrld $30, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967292,4294967292,4294967292,4294967292] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = srem <4 x i32> %x, <i32 -4, i32 -4, i32 -4, i32 -4> ret <4 x i32> %1 } diff --git a/llvm/test/CodeGen/X86/i386-baseptr.ll b/llvm/test/CodeGen/X86/i386-baseptr.ll index 08e4bde7353a..777eb838b84c 100644 --- a/llvm/test/CodeGen/X86/i386-baseptr.ll +++ b/llvm/test/CodeGen/X86/i386-baseptr.ll @@ -109,10 +109,14 @@ define x86_regcallcc void @clobber_baseptr_argptr(i32 %param1, i32 %param2, i32 ; CHECK-NEXT: subl %eax, %edx ; CHECK-NEXT: movl %edx, %esp ; CHECK-NEXT: negl %eax +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: subl $28, %esp ; CHECK-NEXT: movl $405, %esi # imm = 0x195 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP +; CHECK-NEXT: addl $28, %esp +; CHECK-NEXT: popl %esi ; CHECK-NEXT: movl $405, %ebx # imm = 0x195 ; CHECK-NEXT: #APP ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/X86/inline-asm-function-call-pic.ll b/llvm/test/CodeGen/X86/inline-asm-function-call-pic.ll index 3c98eead8d18..d3ca872509ad 100644 --- a/llvm/test/CodeGen/X86/inline-asm-function-call-pic.ll +++ b/llvm/test/CodeGen/X86/inline-asm-function-call-pic.ll @@ -37,6 +37,8 @@ define void @func() local_unnamed_addr #0 { ; CHECK-NEXT: .Ltmp0: ; CHECK-NEXT: addl 
$_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %ebx ; CHECK-NEXT: calll static_func +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: subl $12, %esp ; CHECK-NEXT: #APP ; CHECK-EMPTY: ; CHECK-NEXT: calll static_func @@ -52,6 +54,8 @@ define void @func() local_unnamed_addr #0 { ; CHECK-NEXT: shrl $0, %esp ; CHECK-EMPTY: ; CHECK-NEXT: #NO_APP +; CHECK-NEXT: addl $12, %esp +; CHECK-NEXT: popl %ebp entry: %call = tail call i32 @static_func() ;; We test call, CALL, and jmp. diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll index 275a42e02d74..6fcc1ed068f4 100644 --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -59,7 +59,6 @@ ; CHECK-NEXT: Constant Hoisting ; CHECK-NEXT: Replace intrinsics with calls to vector library ; CHECK-NEXT: Partially inline calls to library functions -; CHECK-NEXT: Expand vector predication intrinsics ; CHECK-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining) ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll index 13fa639dc63b..35c7c0e09f39 100644 --- a/llvm/test/CodeGen/X86/popcnt.ll +++ b/llvm/test/CodeGen/X86/popcnt.ll @@ -182,11 +182,11 @@ define i32 @cnt32(i32 %x) nounwind readnone { ; X64-NDD: # %bb.0: ; X64-NDD-NEXT: shrl %edi, %eax ; X64-NDD-NEXT: andl $1431655765, %eax # imm = 0x55555555 -; X64-NDD-NEXT: subl %eax, %edi, %eax -; X64-NDD-NEXT: andl $858993459, %eax, %ecx # imm = 0x33333333 -; X64-NDD-NEXT: shrl $2, %eax -; X64-NDD-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X64-NDD-NEXT: addl %ecx, %eax +; X64-NDD-NEXT: subl %eax, %edi +; X64-NDD-NEXT: andl $858993459, %edi, %eax # imm = 0x33333333 +; X64-NDD-NEXT: shrl $2, %edi +; X64-NDD-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X64-NDD-NEXT: addl %edi, %eax ; X64-NDD-NEXT: shrl $4, %eax, %ecx ; X64-NDD-NEXT: addl %ecx, %eax ; X64-NDD-NEXT: andl $252645135, 
%eax # imm = 0xF0F0F0F @@ -277,12 +277,12 @@ define i64 @cnt64(i64 %x) nounwind readnone { ; X64-NDD-NEXT: shrq %rdi, %rax ; X64-NDD-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 ; X64-NDD-NEXT: andq %rcx, %rax -; X64-NDD-NEXT: subq %rax, %rdi, %rax -; X64-NDD-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333 -; X64-NDD-NEXT: andq %rcx, %rax, %rdx -; X64-NDD-NEXT: shrq $2, %rax -; X64-NDD-NEXT: andq %rcx, %rax -; X64-NDD-NEXT: addq %rdx, %rax +; X64-NDD-NEXT: subq %rax, %rdi +; X64-NDD-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-NDD-NEXT: andq %rax, %rdi, %rcx +; X64-NDD-NEXT: shrq $2, %rdi +; X64-NDD-NEXT: andq %rdi, %rax +; X64-NDD-NEXT: addq %rcx, %rax ; X64-NDD-NEXT: shrq $4, %rax, %rcx ; X64-NDD-NEXT: addq %rcx, %rax ; X64-NDD-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F @@ -491,32 +491,32 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; X64-NDD-NEXT: shrq %rsi, %rax ; X64-NDD-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 ; X64-NDD-NEXT: andq %rcx, %rax -; X64-NDD-NEXT: subq %rax, %rsi, %rax -; X64-NDD-NEXT: movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333 -; X64-NDD-NEXT: andq %rdx, %rax, %rsi -; X64-NDD-NEXT: shrq $2, %rax -; X64-NDD-NEXT: andq %rdx, %rax -; X64-NDD-NEXT: addq %rsi, %rax -; X64-NDD-NEXT: shrq $4, %rax, %rsi -; X64-NDD-NEXT: addq %rsi, %rax +; X64-NDD-NEXT: subq %rax, %rsi +; X64-NDD-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-NDD-NEXT: andq %rax, %rsi, %rdx +; X64-NDD-NEXT: shrq $2, %rsi +; X64-NDD-NEXT: andq %rax, %rsi +; X64-NDD-NEXT: addq %rsi, %rdx +; X64-NDD-NEXT: shrq $4, %rdx, %rsi +; X64-NDD-NEXT: addq %rsi, %rdx ; X64-NDD-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F -; X64-NDD-NEXT: andq %rsi, %rax +; X64-NDD-NEXT: andq %rsi, %rdx ; X64-NDD-NEXT: movabsq $72340172838076673, %r8 # imm = 0x101010101010101 -; X64-NDD-NEXT: imulq %r8, %rax -; 
X64-NDD-NEXT: shrq $56, %rax +; X64-NDD-NEXT: imulq %r8, %rdx +; X64-NDD-NEXT: shrq $56, %rdx ; X64-NDD-NEXT: shrq %rdi, %r9 ; X64-NDD-NEXT: andq %r9, %rcx -; X64-NDD-NEXT: subq %rcx, %rdi, %rcx -; X64-NDD-NEXT: andq %rdx, %rcx, %rdi -; X64-NDD-NEXT: shrq $2, %rcx -; X64-NDD-NEXT: andq %rdx, %rcx -; X64-NDD-NEXT: addq %rdi, %rcx -; X64-NDD-NEXT: shrq $4, %rcx, %rdx -; X64-NDD-NEXT: addq %rdx, %rcx -; X64-NDD-NEXT: andq %rsi, %rcx -; X64-NDD-NEXT: imulq %r8, %rcx -; X64-NDD-NEXT: shrq $56, %rcx +; X64-NDD-NEXT: subq %rcx, %rdi +; X64-NDD-NEXT: andq %rax, %rdi, %rcx +; X64-NDD-NEXT: shrq $2, %rdi +; X64-NDD-NEXT: andq %rdi, %rax +; X64-NDD-NEXT: addq %rcx, %rax +; X64-NDD-NEXT: shrq $4, %rax, %rcx ; X64-NDD-NEXT: addq %rcx, %rax +; X64-NDD-NEXT: andq %rsi, %rax +; X64-NDD-NEXT: imulq %r8, %rax +; X64-NDD-NEXT: shrq $56, %rax +; X64-NDD-NEXT: addq %rdx, %rax ; X64-NDD-NEXT: xorl %edx, %edx ; X64-NDD-NEXT: retq ; @@ -685,12 +685,12 @@ define i64 @cnt64_noimplicitfloat(i64 %x) nounwind readnone noimplicitfloat { ; X64-NDD-NEXT: shrq %rdi, %rax ; X64-NDD-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 ; X64-NDD-NEXT: andq %rcx, %rax -; X64-NDD-NEXT: subq %rax, %rdi, %rax -; X64-NDD-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333 -; X64-NDD-NEXT: andq %rcx, %rax, %rdx -; X64-NDD-NEXT: shrq $2, %rax -; X64-NDD-NEXT: andq %rcx, %rax -; X64-NDD-NEXT: addq %rdx, %rax +; X64-NDD-NEXT: subq %rax, %rdi +; X64-NDD-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-NDD-NEXT: andq %rax, %rdi, %rcx +; X64-NDD-NEXT: shrq $2, %rdi +; X64-NDD-NEXT: andq %rdi, %rax +; X64-NDD-NEXT: addq %rcx, %rax ; X64-NDD-NEXT: shrq $4, %rax, %rcx ; X64-NDD-NEXT: addq %rcx, %rax ; X64-NDD-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F @@ -759,12 +759,12 @@ define i32 @cnt32_optsize(i32 %x) nounwind readnone optsize { ; X64-NDD: # %bb.0: ; X64-NDD-NEXT: shrl %edi, %eax ; X64-NDD-NEXT: andl $1431655765, %eax # imm = 
0x55555555 -; X64-NDD-NEXT: subl %eax, %edi, %eax -; X64-NDD-NEXT: movl $858993459, %ecx # imm = 0x33333333 -; X64-NDD-NEXT: andl %ecx, %eax, %edx -; X64-NDD-NEXT: shrl $2, %eax -; X64-NDD-NEXT: andl %ecx, %eax -; X64-NDD-NEXT: addl %edx, %eax +; X64-NDD-NEXT: subl %eax, %edi +; X64-NDD-NEXT: movl $858993459, %eax # imm = 0x33333333 +; X64-NDD-NEXT: andl %eax, %edi, %ecx +; X64-NDD-NEXT: shrl $2, %edi +; X64-NDD-NEXT: andl %edi, %eax +; X64-NDD-NEXT: addl %ecx, %eax ; X64-NDD-NEXT: shrl $4, %eax, %ecx ; X64-NDD-NEXT: addl %ecx, %eax ; X64-NDD-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F @@ -864,12 +864,12 @@ define i64 @cnt64_optsize(i64 %x) nounwind readnone optsize { ; X64-NDD-NEXT: shrq %rdi, %rax ; X64-NDD-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 ; X64-NDD-NEXT: andq %rcx, %rax -; X64-NDD-NEXT: subq %rax, %rdi, %rax -; X64-NDD-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333 -; X64-NDD-NEXT: andq %rcx, %rax, %rdx -; X64-NDD-NEXT: shrq $2, %rax -; X64-NDD-NEXT: andq %rcx, %rax -; X64-NDD-NEXT: addq %rdx, %rax +; X64-NDD-NEXT: subq %rax, %rdi +; X64-NDD-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-NDD-NEXT: andq %rax, %rdi, %rcx +; X64-NDD-NEXT: shrq $2, %rdi +; X64-NDD-NEXT: andq %rdi, %rax +; X64-NDD-NEXT: addq %rcx, %rax ; X64-NDD-NEXT: shrq $4, %rax, %rcx ; X64-NDD-NEXT: addq %rcx, %rax ; X64-NDD-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F @@ -1087,32 +1087,32 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X64-NDD-NEXT: shrq %rsi, %rax ; X64-NDD-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 ; X64-NDD-NEXT: andq %rcx, %rax -; X64-NDD-NEXT: subq %rax, %rsi, %rax -; X64-NDD-NEXT: movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333 -; X64-NDD-NEXT: andq %rdx, %rax, %rsi -; X64-NDD-NEXT: shrq $2, %rax -; X64-NDD-NEXT: andq %rdx, %rax -; X64-NDD-NEXT: addq %rsi, %rax -; X64-NDD-NEXT: shrq $4, %rax, %rsi -; 
X64-NDD-NEXT: addq %rsi, %rax +; X64-NDD-NEXT: subq %rax, %rsi +; X64-NDD-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-NDD-NEXT: andq %rax, %rsi, %rdx +; X64-NDD-NEXT: shrq $2, %rsi +; X64-NDD-NEXT: andq %rax, %rsi +; X64-NDD-NEXT: addq %rsi, %rdx +; X64-NDD-NEXT: shrq $4, %rdx, %rsi +; X64-NDD-NEXT: addq %rsi, %rdx ; X64-NDD-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F -; X64-NDD-NEXT: andq %rsi, %rax +; X64-NDD-NEXT: andq %rsi, %rdx ; X64-NDD-NEXT: movabsq $72340172838076673, %r8 # imm = 0x101010101010101 -; X64-NDD-NEXT: imulq %r8, %rax -; X64-NDD-NEXT: shrq $56, %rax +; X64-NDD-NEXT: imulq %r8, %rdx +; X64-NDD-NEXT: shrq $56, %rdx ; X64-NDD-NEXT: shrq %rdi, %r9 ; X64-NDD-NEXT: andq %r9, %rcx -; X64-NDD-NEXT: subq %rcx, %rdi, %rcx -; X64-NDD-NEXT: andq %rdx, %rcx, %rdi -; X64-NDD-NEXT: shrq $2, %rcx -; X64-NDD-NEXT: andq %rdx, %rcx -; X64-NDD-NEXT: addq %rdi, %rcx -; X64-NDD-NEXT: shrq $4, %rcx, %rdx -; X64-NDD-NEXT: addq %rdx, %rcx -; X64-NDD-NEXT: andq %rsi, %rcx -; X64-NDD-NEXT: imulq %r8, %rcx -; X64-NDD-NEXT: shrq $56, %rcx +; X64-NDD-NEXT: subq %rcx, %rdi +; X64-NDD-NEXT: andq %rax, %rdi, %rcx +; X64-NDD-NEXT: shrq $2, %rdi +; X64-NDD-NEXT: andq %rdi, %rax +; X64-NDD-NEXT: addq %rcx, %rax +; X64-NDD-NEXT: shrq $4, %rax, %rcx ; X64-NDD-NEXT: addq %rcx, %rax +; X64-NDD-NEXT: andq %rsi, %rax +; X64-NDD-NEXT: imulq %r8, %rax +; X64-NDD-NEXT: shrq $56, %rax +; X64-NDD-NEXT: addq %rdx, %rax ; X64-NDD-NEXT: xorl %edx, %edx ; X64-NDD-NEXT: retq ; @@ -1257,11 +1257,11 @@ define i32 @cnt32_pgso(i32 %x) nounwind readnone !prof !14 { ; X64-NDD: # %bb.0: ; X64-NDD-NEXT: shrl %edi, %eax ; X64-NDD-NEXT: andl $1431655765, %eax # imm = 0x55555555 -; X64-NDD-NEXT: subl %eax, %edi, %eax -; X64-NDD-NEXT: andl $858993459, %eax, %ecx # imm = 0x33333333 -; X64-NDD-NEXT: shrl $2, %eax -; X64-NDD-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X64-NDD-NEXT: addl %ecx, %eax +; X64-NDD-NEXT: subl %eax, %edi +; X64-NDD-NEXT: andl 
$858993459, %edi, %eax # imm = 0x33333333 +; X64-NDD-NEXT: shrl $2, %edi +; X64-NDD-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X64-NDD-NEXT: addl %edi, %eax ; X64-NDD-NEXT: shrl $4, %eax, %ecx ; X64-NDD-NEXT: addl %ecx, %eax ; X64-NDD-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F @@ -1352,12 +1352,12 @@ define i64 @cnt64_pgso(i64 %x) nounwind readnone !prof !14 { ; X64-NDD-NEXT: shrq %rdi, %rax ; X64-NDD-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 ; X64-NDD-NEXT: andq %rcx, %rax -; X64-NDD-NEXT: subq %rax, %rdi, %rax -; X64-NDD-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333 -; X64-NDD-NEXT: andq %rcx, %rax, %rdx -; X64-NDD-NEXT: shrq $2, %rax -; X64-NDD-NEXT: andq %rcx, %rax -; X64-NDD-NEXT: addq %rdx, %rax +; X64-NDD-NEXT: subq %rax, %rdi +; X64-NDD-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-NDD-NEXT: andq %rax, %rdi, %rcx +; X64-NDD-NEXT: shrq $2, %rdi +; X64-NDD-NEXT: andq %rdi, %rax +; X64-NDD-NEXT: addq %rcx, %rax ; X64-NDD-NEXT: shrq $4, %rax, %rcx ; X64-NDD-NEXT: addq %rcx, %rax ; X64-NDD-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F @@ -1568,32 +1568,32 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X64-NDD-NEXT: shrq %rsi, %rax ; X64-NDD-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 ; X64-NDD-NEXT: andq %rcx, %rax -; X64-NDD-NEXT: subq %rax, %rsi, %rax -; X64-NDD-NEXT: movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333 -; X64-NDD-NEXT: andq %rdx, %rax, %rsi -; X64-NDD-NEXT: shrq $2, %rax -; X64-NDD-NEXT: andq %rdx, %rax -; X64-NDD-NEXT: addq %rsi, %rax -; X64-NDD-NEXT: shrq $4, %rax, %rsi -; X64-NDD-NEXT: addq %rsi, %rax +; X64-NDD-NEXT: subq %rax, %rsi +; X64-NDD-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-NDD-NEXT: andq %rax, %rsi, %rdx +; X64-NDD-NEXT: shrq $2, %rsi +; X64-NDD-NEXT: andq %rax, %rsi +; X64-NDD-NEXT: addq %rsi, %rdx +; X64-NDD-NEXT: shrq $4, %rdx, 
%rsi +; X64-NDD-NEXT: addq %rsi, %rdx ; X64-NDD-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F -; X64-NDD-NEXT: andq %rsi, %rax +; X64-NDD-NEXT: andq %rsi, %rdx ; X64-NDD-NEXT: movabsq $72340172838076673, %r8 # imm = 0x101010101010101 -; X64-NDD-NEXT: imulq %r8, %rax -; X64-NDD-NEXT: shrq $56, %rax +; X64-NDD-NEXT: imulq %r8, %rdx +; X64-NDD-NEXT: shrq $56, %rdx ; X64-NDD-NEXT: shrq %rdi, %r9 ; X64-NDD-NEXT: andq %r9, %rcx -; X64-NDD-NEXT: subq %rcx, %rdi, %rcx -; X64-NDD-NEXT: andq %rdx, %rcx, %rdi -; X64-NDD-NEXT: shrq $2, %rcx -; X64-NDD-NEXT: andq %rdx, %rcx -; X64-NDD-NEXT: addq %rdi, %rcx -; X64-NDD-NEXT: shrq $4, %rcx, %rdx -; X64-NDD-NEXT: addq %rdx, %rcx -; X64-NDD-NEXT: andq %rsi, %rcx -; X64-NDD-NEXT: imulq %r8, %rcx -; X64-NDD-NEXT: shrq $56, %rcx +; X64-NDD-NEXT: subq %rcx, %rdi +; X64-NDD-NEXT: andq %rax, %rdi, %rcx +; X64-NDD-NEXT: shrq $2, %rdi +; X64-NDD-NEXT: andq %rdi, %rax +; X64-NDD-NEXT: addq %rcx, %rax +; X64-NDD-NEXT: shrq $4, %rax, %rcx ; X64-NDD-NEXT: addq %rcx, %rax +; X64-NDD-NEXT: andq %rsi, %rax +; X64-NDD-NEXT: imulq %r8, %rax +; X64-NDD-NEXT: shrq $56, %rax +; X64-NDD-NEXT: addq %rdx, %rax ; X64-NDD-NEXT: xorl %edx, %edx ; X64-NDD-NEXT: retq ; @@ -1739,11 +1739,11 @@ define i32 @popcount_zext_i32(i16 zeroext %x) { ; X64-NDD: # %bb.0: ; X64-NDD-NEXT: shrl %edi, %eax ; X64-NDD-NEXT: andl $21845, %eax # imm = 0x5555 -; X64-NDD-NEXT: subl %eax, %edi, %eax -; X64-NDD-NEXT: andl $858993459, %eax, %ecx # imm = 0x33333333 -; X64-NDD-NEXT: shrl $2, %eax -; X64-NDD-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X64-NDD-NEXT: addl %ecx, %eax +; X64-NDD-NEXT: subl %eax, %edi +; X64-NDD-NEXT: andl $858993459, %edi, %eax # imm = 0x33333333 +; X64-NDD-NEXT: shrl $2, %edi +; X64-NDD-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X64-NDD-NEXT: addl %edi, %eax ; X64-NDD-NEXT: shrl $4, %eax, %ecx ; X64-NDD-NEXT: addl %ecx, %eax ; X64-NDD-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F diff --git 
a/llvm/test/CodeGen/X86/select_const_i128.ll b/llvm/test/CodeGen/X86/select_const_i128.ll index d7859baec815..f0f0c584a7fc 100644 --- a/llvm/test/CodeGen/X86/select_const_i128.ll +++ b/llvm/test/CodeGen/X86/select_const_i128.ll @@ -23,8 +23,8 @@ define i128 @select_eq_i128(ptr %a) { ; NDD-NEXT: ptest %xmm0, %xmm0 ; NDD-NEXT: setne %al ; NDD-NEXT: addq $-1, %rax -; NDD-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF -; NDD-NEXT: adcq $0, %rcx, %rdx +; NDD-NEXT: movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF +; NDD-NEXT: adcq $0, %rdx ; NDD-NEXT: retq %1 = load i128, ptr %a, align 16 %cmp = icmp eq i128 %1, 1 diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll index d2a1e5e42812..33592027dee9 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll @@ -624,7 +624,7 @@ define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind { ; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -637,7 +637,7 @@ define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind { ; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; CHECK-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -649,7 +649,7 @@ define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind { ; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1 ; 
CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/x86-32-intrcc.ll b/llvm/test/CodeGen/X86/x86-32-intrcc.ll index 3c3944c2082b..a0f937e2c323 100644 --- a/llvm/test/CodeGen/X86/x86-32-intrcc.ll +++ b/llvm/test/CodeGen/X86/x86-32-intrcc.ll @@ -108,8 +108,10 @@ define x86_intrcc void @test_isr_clobbers(ptr byval(%struct.interrupt_frame) %fr ; CHECK-NEXT: pushl %eax ; CHECK-NEXT: andl $-16, %esp ; CHECK-NEXT: cld +; CHECK-NEXT: pushl %ebp ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP +; CHECK-NEXT: popl %ebp ; CHECK-NEXT: leal -12(%ebp), %esp ; CHECK-NEXT: popl %eax ; CHECK-NEXT: popl %ebx @@ -127,8 +129,10 @@ define x86_intrcc void @test_isr_clobbers(ptr byval(%struct.interrupt_frame) %fr ; CHECK0-NEXT: pushl %eax ; CHECK0-NEXT: andl $-16, %esp ; CHECK0-NEXT: cld +; CHECK0-NEXT: pushl %ebp ; CHECK0-NEXT: #APP ; CHECK0-NEXT: #NO_APP +; CHECK0-NEXT: popl %ebp ; CHECK0-NEXT: leal -12(%ebp), %esp ; CHECK0-NEXT: popl %eax ; CHECK0-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/x86-64-baseptr.ll b/llvm/test/CodeGen/X86/x86-64-baseptr.ll index 8cda4ba2814b..020004def6e7 100644 --- a/llvm/test/CodeGen/X86/x86-64-baseptr.ll +++ b/llvm/test/CodeGen/X86/x86-64-baseptr.ll @@ -136,10 +136,14 @@ define void @clobber_base() #0 { ; X32ABI-NEXT: subl %eax, %edx ; X32ABI-NEXT: negl %eax ; X32ABI-NEXT: movl %edx, %esp +; X32ABI-NEXT: pushq %rbx +; X32ABI-NEXT: subl $24, %esp ; X32ABI-NEXT: movl $405, %ebx # imm = 0x195 ; X32ABI-NEXT: #APP ; X32ABI-NEXT: nop ; X32ABI-NEXT: #NO_APP +; X32ABI-NEXT: addl $24, %esp +; X32ABI-NEXT: popq %rbx ; X32ABI-NEXT: movl $8, %edx ; X32ABI-NEXT: #APP ; X32ABI-NEXT: movl %edx, (%ebx) @@ -268,6 
+272,8 @@ define x86_regcallcc void @clobber_baseptr_argptr(i32 %param1, i32 %param2, i32 ; X32ABI-NEXT: subl %eax, %edx ; X32ABI-NEXT: negl %eax ; X32ABI-NEXT: movl %edx, %esp +; X32ABI-NEXT: pushq %rbx +; X32ABI-NEXT: subl $24, %esp ; X32ABI-NEXT: movl $405, %ebx # imm = 0x195 ; X32ABI-NEXT: #APP ; X32ABI-NEXT: nop @@ -275,6 +281,8 @@ define x86_regcallcc void @clobber_baseptr_argptr(i32 %param1, i32 %param2, i32 ; X32ABI-NEXT: #APP ; X32ABI-NEXT: nop ; X32ABI-NEXT: #NO_APP +; X32ABI-NEXT: addl $24, %esp +; X32ABI-NEXT: popq %rbx ; X32ABI-NEXT: movl $8, %edx ; X32ABI-NEXT: #APP ; X32ABI-NEXT: movl %edx, (%ebx) @@ -385,10 +393,14 @@ define void @vmw_host_printf(ptr %fmt, ...) nounwind { ; X32ABI-NEXT: movl $48, (%eax) ; X32ABI-NEXT: movl $8, (%eax) ; X32ABI-NEXT: xorl %eax, %eax +; X32ABI-NEXT: pushq %rbx +; X32ABI-NEXT: subl $24, %esp ; X32ABI-NEXT: xorl %ebx, %ebx ; X32ABI-NEXT: xorl %ecx, %ecx ; X32ABI-NEXT: #APP ; X32ABI-NEXT: #NO_APP +; X32ABI-NEXT: addl $24, %esp +; X32ABI-NEXT: popq %rbx ; X32ABI-NEXT: leal -8(%ebp), %esp ; X32ABI-NEXT: popq %rbx ; X32ABI-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll b/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll index 47aefdbf0e46..b4c18dd7f457 100644 --- a/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll +++ b/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll @@ -94,6 +94,8 @@ define i64 @read_flags_reg_pressure() nounwind { ; WIN64-NEXT: pushq %rbx ; WIN64-NEXT: subq $16, %rsp ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rbp +; WIN64-NEXT: pushq %rbp +; WIN64-NEXT: pushq %rax ; WIN64-NEXT: #APP ; WIN64-NEXT: #NO_APP ; WIN64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -103,6 +105,8 @@ define i64 @read_flags_reg_pressure() nounwind { ; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; WIN64-NEXT: #APP ; WIN64-NEXT: #NO_APP +; WIN64-NEXT: addq $8, %rsp +; WIN64-NEXT: popq %rbp ; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; WIN64-NEXT: addq $16, 
%rsp ; WIN64-NEXT: popq %rbx @@ -177,6 +181,8 @@ define void @write_flags_reg_pressure(i64 noundef %0) nounwind { ; WIN64-NEXT: subq $16, %rsp ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rbp ; WIN64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WIN64-NEXT: pushq %rbp +; WIN64-NEXT: pushq %rax ; WIN64-NEXT: #APP ; WIN64-NEXT: #NO_APP ; WIN64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -186,6 +192,8 @@ define void @write_flags_reg_pressure(i64 noundef %0) nounwind { ; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; WIN64-NEXT: #APP ; WIN64-NEXT: #NO_APP +; WIN64-NEXT: popq %rax +; WIN64-NEXT: popq %rbp ; WIN64-NEXT: addq $16, %rsp ; WIN64-NEXT: popq %rbx ; WIN64-NEXT: popq %rdi |
